diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,175783 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999146539216524, + "eval_steps": 500, + "global_step": 5858, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1740.0, + "completions/mean_length": 699.97265625, + "completions/mean_terminated_length": 650.854248046875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.00017069215669539986, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.6960226457492601, + "kl": 0.026153564453125, + "learning_rate": 0.0, + "loss": 0.0023, + "num_tokens": 221305.0, + "reward": 1.7587890625, + "reward_std": 0.520090639591217, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35231640934944153, + "rewards/format_reward/mean": 0.70703125, + "rewards/format_reward/std": 0.45601576566696167, + "rewards/tag_count_reward/mean": 0.9072265625, + "rewards/tag_count_reward/std": 0.17555660009384155, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1583.0, + "completions/mean_length": 664.51171875, + "completions/mean_terminated_length": 659.0863037109375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.0003413843133907997, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.6672450834093814, + "kl": 0.02294921875, + "learning_rate": 3.412969283276451e-08, + "loss": 0.0024, + "num_tokens": 433548.0, + "reward": 1.7197265625, + "reward_std": 0.48479723930358887, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.76953125, + "rewards/format_reward/std": 0.4219578504562378, + "rewards/tag_count_reward/mean": 0.9267578125, + "rewards/tag_count_reward/std": 0.14771738648414612, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 712.08984375, + "completions/mean_terminated_length": 706.8510131835938, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.0005120764700861995, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.7113307871140736, + "kl": 0.0203857421875, + "learning_rate": 6.825938566552902e-08, + "loss": 0.0377, + "num_tokens": 654131.0, + "reward": 1.6728515625, + "reward_std": 0.5393630266189575, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24253563582897186, + "rewards/format_reward/mean": 0.7109375, + "rewards/format_reward/std": 0.45421501994132996, + "rewards/tag_count_reward/mean": 0.8994140625, + "rewards/tag_count_reward/std": 0.19899170100688934, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1951.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 765.7421875, + "completions/mean_terminated_length": 765.7421875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.0006827686267815994, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.49369061368759376, + "kl": 0.021728515625, + "learning_rate": 1.0238907849829352e-07, + "loss": 0.0133, + "num_tokens": 890737.0, + "reward": 1.787109375, + "reward_std": 0.5448973178863525, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.7578125, + "rewards/format_reward/std": 0.4292463958263397, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.14077810943126678, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1679.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 668.87890625, + "completions/mean_terminated_length": 668.87890625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.0008534607834769992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.795810338945291, + "kl": 0.0206298828125, + "learning_rate": 1.3651877133105803e-07, + "loss": 0.0155, + "num_tokens": 1102994.0, + "reward": 1.8330078125, + "reward_std": 0.581057071685791, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38467901945114136, + "rewards/format_reward/mean": 0.73046875, + "rewards/format_reward/std": 0.44458550214767456, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.1370389759540558, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 654.87109375, + "completions/mean_terminated_length": 649.4078979492188, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.001024152940172399, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.6273212771903427, + "kl": 0.02484130859375, + "learning_rate": 1.7064846416382255e-07, + "loss": -0.0138, + "num_tokens": 1317121.0, + "reward": 1.75390625, + "reward_std": 0.48795753717422485, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.7734375, + "rewards/format_reward/std": 0.41942715644836426, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.21579766273498535, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 750.765625, + "completions/mean_terminated_length": 745.678466796875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.0011948450968677989, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.4345901138589758, + "kl": 0.021697998046875, + "learning_rate": 2.0477815699658704e-07, + "loss": -0.0007, + "num_tokens": 1556389.0, + "reward": 1.921875, + "reward_std": 0.5260998010635376, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.8359375, + "rewards/format_reward/std": 0.3710577189922333, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.18519829213619232, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 779.62890625, + "completions/mean_terminated_length": 754.362548828125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.0013655372535631989, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.5976821954689728, + "kl": 0.022064208984375, + "learning_rate": 2.3890784982935155e-07, + "loss": 0.0433, + "num_tokens": 1794934.0, + "reward": 1.466796875, + "reward_std": 0.5314927101135254, + "rewards/accuracy_reward/mean": 0.01171875, + "rewards/accuracy_reward/std": 0.1078278198838234, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49209436774253845, + "rewards/tag_count_reward/mean": 0.861328125, + "rewards/tag_count_reward/std": 0.20842216908931732, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1680.0, + "completions/max_terminated_length": 1680.0, + "completions/mean_length": 689.6796875, + "completions/mean_terminated_length": 689.6796875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.0015362294102585986, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.5107001211446651, + "kl": 0.018280029296875, + "learning_rate": 2.7303754266211607e-07, + "loss": 0.0175, + "num_tokens": 2013748.0, + "reward": 1.802734375, + "reward_std": 0.4827074110507965, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.76171875, + "rewards/format_reward/std": 0.4268665909767151, + "rewards/tag_count_reward/mean": 0.927734375, + "rewards/tag_count_reward/std": 0.14569759368896484, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1698.0, + "completions/mean_length": 665.43359375, + "completions/mean_terminated_length": 649.03955078125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.0017069215669539984, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.525378472185575, + "kl": 0.024810791015625, + "learning_rate": 3.071672354948806e-07, + "loss": 0.0445, + "num_tokens": 2228979.0, + "reward": 1.7666015625, + "reward_std": 0.45018503069877625, + "rewards/accuracy_reward/mean": 0.03750000149011612, + "rewards/accuracy_reward/std": 0.19038060307502747, + "rewards/format_reward/mean": 0.79296875, + "rewards/format_reward/std": 0.40597182512283325, + "rewards/tag_count_reward/mean": 0.9384765625, + "rewards/tag_count_reward/std": 0.13958631455898285, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 634.7734375, + "completions/mean_terminated_length": 612.34130859375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.0018776137236493984, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.6917640871256874, + "kl": 0.02581787109375, + "learning_rate": 3.412969283276451e-07, + "loss": 0.0658, + "num_tokens": 2441433.0, + "reward": 1.72265625, + "reward_std": 0.4807218313217163, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28082075715065, + "rewards/format_reward/mean": 0.7265625, + "rewards/format_reward/std": 0.446596622467041, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.17499125003814697, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1684.0, + "completions/mean_length": 666.25390625, + "completions/mean_terminated_length": 649.8695678710938, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.002048305880344798, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.4882774647015533, + "kl": 0.026519775390625, + "learning_rate": 3.754266211604096e-07, + "loss": 0.0329, + "num_tokens": 2657098.0, + "reward": 1.9501953125, + "reward_std": 0.36687296628952026, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3483152687549591, + "rewards/tag_count_reward/mean": 0.9580078125, + "rewards/tag_count_reward/std": 0.11900047957897186, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 772.16796875, + "completions/mean_terminated_length": 767.1647338867188, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.002218998037040198, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.35021850059197407, + "kl": 0.0189208984375, + "learning_rate": 4.0955631399317407e-07, + "loss": -0.0105, + "num_tokens": 2892293.0, + "reward": 1.9736328125, + "reward_std": 0.28106746077537537, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.9140625, + "rewards/format_reward/std": 0.28082075715065, + "rewards/tag_count_reward/mean": 0.9775390625, + "rewards/tag_count_reward/std": 0.07497318834066391, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 655.3515625, + "completions/mean_terminated_length": 649.8902587890625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.0023896901937355977, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.38287855807929966, + "kl": 0.022796630859375, + "learning_rate": 4.436860068259386e-07, + "loss": 0.0063, + "num_tokens": 3100559.0, + "reward": 1.9033203125, + "reward_std": 0.2304399013519287, + "rewards/accuracy_reward/mean": 0.02916666679084301, + "rewards/accuracy_reward/std": 0.16862517595291138, + "rewards/format_reward/mean": 0.9140625, + "rewards/format_reward/std": 0.28082075715065, + "rewards/tag_count_reward/mean": 0.9619140625, + "rewards/tag_count_reward/std": 0.1573825478553772, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1651.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 660.8828125, + "completions/mean_terminated_length": 660.8828125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.0025603823504309975, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.32308384087003555, + "kl": 0.02117919921875, + "learning_rate": 4.778156996587031e-07, + "loss": 0.0254, + "num_tokens": 3312273.0, + "reward": 2.013671875, + "reward_std": 0.30645668506622314, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 0.94140625, + "rewards/format_reward/std": 0.23532284796237946, + "rewards/tag_count_reward/mean": 0.982421875, + "rewards/tag_count_reward/std": 0.08392103016376495, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1481.0, + "completions/max_terminated_length": 1481.0, + "completions/mean_length": 670.453125, + "completions/mean_terminated_length": 670.453125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.0027310745071263977, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.3302813183438955, + "kl": 0.020111083984375, + "learning_rate": 5.119453924914676e-07, + "loss": 0.0279, + "num_tokens": 3523717.0, + "reward": 2.041015625, + "reward_std": 0.2740479111671448, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 0.9609375, + "rewards/format_reward/std": 0.19412322342395782, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.05334262177348137, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1646.0, + "completions/max_terminated_length": 1646.0, + "completions/mean_length": 645.13671875, + "completions/mean_terminated_length": 645.13671875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.0029017666638217975, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.38555452275081475, + "kl": 0.0184326171875, + "learning_rate": 5.460750853242321e-07, + "loss": 0.0176, + "num_tokens": 3728328.0, + "reward": 2.134765625, + "reward_std": 0.3602128028869629, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.387910932302475, + "rewards/format_reward/mean": 0.9609375, + "rewards/format_reward/std": 0.19412322342395782, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.048530805855989456, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1844.0, + "completions/max_terminated_length": 1844.0, + "completions/mean_length": 633.48046875, + "completions/mean_terminated_length": 633.48046875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.0030724588205171973, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2965742288423524, + "kl": 0.024078369140625, + "learning_rate": 5.802047781569966e-07, + "loss": 0.0003, + "num_tokens": 3930467.0, + "reward": 2.0146484375, + "reward_std": 0.15440350770950317, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.034663453698158264, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 761.21484375, + "completions/mean_terminated_length": 756.168701171875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.003243150977212597, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2565676196887217, + "kl": 0.020477294921875, + "learning_rate": 6.143344709897612e-07, + "loss": 0.0231, + "num_tokens": 4164794.0, + "reward": 2.001953125, + "reward_std": 0.19631868600845337, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.18453538417816162, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.07277647405862808, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 709.3828125, + "completions/mean_terminated_length": 688.1349487304688, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.003413843133907997, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.22361027824404636, + "kl": 0.018218994140625, + "learning_rate": 6.484641638225256e-07, + "loss": 0.0297, + "num_tokens": 4384508.0, + "reward": 2.015625, + "reward_std": 0.15065248310565948, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.09319689869880676, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 737.25390625, + "completions/mean_terminated_length": 726.9330444335938, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.0035845352906033966, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.23863480966267778, + "kl": 0.019378662109375, + "learning_rate": 6.825938566552902e-07, + "loss": 0.0033, + "num_tokens": 4612685.0, + "reward": 2.1064453125, + "reward_std": 0.2640492916107178, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.05603000521659851, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1863.0, + "completions/mean_length": 776.3125, + "completions/mean_terminated_length": 761.2332153320312, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.003755227447298797, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.22593832211459058, + "kl": 0.020538330078125, + "learning_rate": 7.167235494880546e-07, + "loss": 0.0514, + "num_tokens": 4853197.0, + "reward": 2.1123046875, + "reward_std": 0.19619695842266083, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.0808708667755127, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 764.1875, + "completions/mean_terminated_length": 759.1530151367188, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.003925919603994197, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.18614741438601742, + "kl": 0.017608642578125, + "learning_rate": 7.508532423208192e-07, + "loss": 0.0177, + "num_tokens": 5087581.0, + "reward": 2.0986328125, + "reward_std": 0.1621382087469101, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1460.0, + "completions/max_terminated_length": 1460.0, + "completions/mean_length": 717.40625, + "completions/mean_terminated_length": 717.40625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.004096611760689596, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.24793208759564878, + "kl": 0.020660400390625, + "learning_rate": 7.849829351535837e-07, + "loss": -0.0111, + "num_tokens": 5309493.0, + "reward": 2.021484375, + "reward_std": 0.11961115896701813, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17433346807956696, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1473.0, + "completions/mean_length": 656.84765625, + "completions/mean_terminated_length": 651.3922119140625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.004267303917384996, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2329236698609555, + "kl": 0.026153564453125, + "learning_rate": 8.191126279863481e-07, + "loss": 0.017, + "num_tokens": 5516414.0, + "reward": 2.04296875, + "reward_std": 0.1254390925168991, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1861.0, + "completions/max_terminated_length": 1861.0, + "completions/mean_length": 655.07421875, + "completions/mean_terminated_length": 655.07421875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.004437996074080396, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.3078449386385677, + "kl": 0.025604248046875, + "learning_rate": 8.532423208191128e-07, + "loss": -0.0065, + "num_tokens": 5726785.0, + "reward": 2.16015625, + "reward_std": 0.23424021899700165, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 731.96875, + "completions/mean_terminated_length": 726.807861328125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.004608688230775796, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3042436286216913, + "kl": 0.02325439453125, + "learning_rate": 8.873720136518772e-07, + "loss": 0.0006, + "num_tokens": 5957497.0, + "reward": 2.0673828125, + "reward_std": 0.20305249094963074, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.07797780632972717, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 764.58203125, + "completions/mean_terminated_length": 754.4763793945312, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.0047793803874711955, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.16970276633132267, + "kl": 0.018951416015625, + "learning_rate": 9.215017064846417e-07, + "loss": 0.0286, + "num_tokens": 6192798.0, + "reward": 2.005859375, + "reward_std": 0.11850108206272125, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 817.80078125, + "completions/mean_terminated_length": 812.9765014648438, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.004950072544166595, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.27315396467401004, + "kl": 0.018798828125, + "learning_rate": 9.556313993174062e-07, + "loss": 0.0127, + "num_tokens": 6446187.0, + "reward": 2.201171875, + "reward_std": 0.32791823148727417, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.09076117724180222, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 648.19921875, + "completions/mean_terminated_length": 642.7098388671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.005120764700861995, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.3352510313415463, + "kl": 0.019134521484375, + "learning_rate": 9.897610921501708e-07, + "loss": 0.0116, + "num_tokens": 6650878.0, + "reward": 2.16796875, + "reward_std": 0.2905258536338806, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.39417871832847595, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.06957504153251648, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1425.0, + "completions/max_terminated_length": 1425.0, + "completions/mean_length": 685.34765625, + "completions/mean_terminated_length": 685.34765625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.005291456857557395, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.2677688303983964, + "kl": 0.02020263671875, + "learning_rate": 1.0238907849829352e-06, + "loss": 0.0148, + "num_tokens": 6866487.0, + "reward": 2.1748046875, + "reward_std": 0.3014334440231323, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.06436405330896378, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1784.0, + "completions/max_terminated_length": 1784.0, + "completions/mean_length": 666.828125, + "completions/mean_terminated_length": 666.828125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.0054621490142527955, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.317351360294854, + "kl": 0.0250244140625, + "learning_rate": 1.0580204778156999e-06, + "loss": 0.0032, + "num_tokens": 7077835.0, + "reward": 2.162109375, + "reward_std": 0.2198190689086914, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38467901945114136, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0539139099419117, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 781.69140625, + "completions/mean_terminated_length": 771.720458984375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.005632841170948195, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.26212350557246766, + "kl": 0.019378662109375, + "learning_rate": 1.0921501706484643e-06, + "loss": 0.0286, + "num_tokens": 7317628.0, + "reward": 2.1103515625, + "reward_std": 0.28060513734817505, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.9814453125, + "rewards/tag_count_reward/std": 0.12679094076156616, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1917.0, + "completions/max_terminated_length": 1917.0, + "completions/mean_length": 735.49609375, + "completions/mean_terminated_length": 735.49609375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.005803533327643595, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.31977311499237154, + "kl": 0.02655029296875, + "learning_rate": 1.1262798634812287e-06, + "loss": -0.0097, + "num_tokens": 7547595.0, + "reward": 2.31640625, + "reward_std": 0.25581619143486023, + "rewards/accuracy_reward/mean": 0.32421875, + "rewards/accuracy_reward/std": 0.46899911761283875, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 781.90625, + "completions/mean_terminated_length": 781.90625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.005974225484338995, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.32979701669896266, + "kl": 0.025634765625, + "learning_rate": 1.1604095563139933e-06, + "loss": -0.0028, + "num_tokens": 7795203.0, + "reward": 2.1708984375, + "reward_std": 0.28805747628211975, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40311288833618164, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9833984375, + "rewards/tag_count_reward/std": 0.11490735411643982, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1357.0, + "completions/mean_length": 748.59375, + "completions/mean_terminated_length": 743.4981079101562, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.006144917641034395, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.3192548355784668, + "kl": 0.0296630859375, + "learning_rate": 1.1945392491467577e-06, + "loss": 0.0257, + "num_tokens": 8027995.0, + "reward": 2.0791015625, + "reward_std": 0.19237858057022095, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.07797780632972717, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1874.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 782.2109375, + "completions/mean_terminated_length": 782.2109375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.006315609797729794, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2549704129783805, + "kl": 0.02056884765625, + "learning_rate": 1.2286689419795223e-06, + "loss": 0.0118, + "num_tokens": 8269969.0, + "reward": 2.1005859375, + "reward_std": 0.1964980810880661, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3268752694129944, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.08950243145227432, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1750.0, + "completions/mean_length": 833.31640625, + "completions/mean_terminated_length": 828.552978515625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.006486301954425194, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.2836156556784517, + "kl": 0.02264404296875, + "learning_rate": 1.2627986348122867e-06, + "loss": 0.0276, + "num_tokens": 8524546.0, + "reward": 2.115234375, + "reward_std": 0.30806592106819153, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38467901945114136, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.970703125, + "rewards/tag_count_reward/std": 0.1645585149526596, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1664.0, + "completions/mean_length": 672.3828125, + "completions/mean_terminated_length": 666.98828125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.006656994111120594, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.28944072137018584, + "kl": 0.0262451171875, + "learning_rate": 1.2969283276450511e-06, + "loss": 0.0248, + "num_tokens": 8737956.0, + "reward": 2.00390625, + "reward_std": 0.2898312211036682, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.93359375, + "rewards/format_reward/std": 0.24947863817214966, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.2394847422838211, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1988.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 742.890625, + "completions/mean_terminated_length": 742.890625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.006827686267815994, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.26014395894189263, + "kl": 0.022247314453125, + "learning_rate": 1.331058020477816e-06, + "loss": 0.0008, + "num_tokens": 8969704.0, + "reward": 2.0595703125, + "reward_std": 0.3531615436077118, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.9697265625, + "rewards/tag_count_reward/std": 0.1651248037815094, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1794.0, + "completions/max_terminated_length": 1794.0, + "completions/mean_length": 741.0625, + "completions/mean_terminated_length": 741.0625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.0069983784245113935, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.33434269768217983, + "kl": 0.022979736328125, + "learning_rate": 1.3651877133105804e-06, + "loss": 0.045, + "num_tokens": 9198984.0, + "reward": 2.1435546875, + "reward_std": 0.38347765803337097, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4087733030319214, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.9677734375, + "rewards/tag_count_reward/std": 0.16770224273204803, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 686.65234375, + "completions/mean_terminated_length": 658.4240112304688, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.007169070581206793, + "frac_reward_zero_std": 0.375, + "grad_norm": 259.99462741774914, + "kl": 2.299072265625, + "learning_rate": 1.3993174061433448e-06, + "loss": 0.1441, + "num_tokens": 9414191.0, + "reward": 2.0478515625, + "reward_std": 0.31660234928131104, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.95703125, + "rewards/format_reward/std": 0.20318391919136047, + "rewards/tag_count_reward/mean": 0.9619140625, + "rewards/tag_count_reward/std": 0.1819411963224411, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1324.0, + "completions/mean_length": 707.90234375, + "completions/mean_terminated_length": 702.6470947265625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.007339762737902193, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.2815559658856631, + "kl": 0.02288818359375, + "learning_rate": 1.4334470989761092e-06, + "loss": 0.0052, + "num_tokens": 9637142.0, + "reward": 2.27734375, + "reward_std": 0.25420188903808594, + "rewards/accuracy_reward/mean": 0.30078125, + "rewards/accuracy_reward/std": 0.45949608087539673, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.10077822208404541, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 714.56640625, + "completions/mean_terminated_length": 704.0669555664062, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.007510454894597594, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.29848551328865974, + "kl": 0.02386474609375, + "learning_rate": 1.4675767918088738e-06, + "loss": 0.0465, + "num_tokens": 9866887.0, + "reward": 2.21875, + "reward_std": 0.29361671209335327, + "rewards/accuracy_reward/mean": 0.23828125, + "rewards/accuracy_reward/std": 0.4268665909767151, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.0794435366988182, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1917.0, + "completions/max_terminated_length": 1917.0, + "completions/mean_length": 729.1796875, + "completions/mean_terminated_length": 729.1796875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.0076811470512929934, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19169674605162618, + "kl": 0.02410888671875, + "learning_rate": 1.5017064846416384e-06, + "loss": 0.023, + "num_tokens": 10091845.0, + "reward": 2.125, + "reward_std": 0.2060512900352478, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.12426253408193588, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 769.4453125, + "completions/mean_terminated_length": 749.1508178710938, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.007851839207988393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.31778046639600505, + "kl": 0.024658203125, + "learning_rate": 1.5358361774744028e-06, + "loss": 0.0679, + "num_tokens": 10327815.0, + "reward": 2.2255859375, + "reward_std": 0.31923922896385193, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.4382871091365814, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9873046875, + "rewards/tag_count_reward/std": 0.09437597543001175, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1465.0, + "completions/mean_length": 673.421875, + "completions/mean_terminated_length": 657.12255859375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.008022531364683793, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2538539558367091, + "kl": 0.025665283203125, + "learning_rate": 1.5699658703071675e-06, + "loss": 0.0663, + "num_tokens": 10543475.0, + "reward": 2.1689453125, + "reward_std": 0.19735270738601685, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9892578125, + "rewards/tag_count_reward/std": 0.0836181566119194, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1442.0, + "completions/max_terminated_length": 1442.0, + "completions/mean_length": 656.0859375, + "completions/mean_terminated_length": 656.0859375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.008193223521379193, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.18750960301503622, + "kl": 0.028289794921875, + "learning_rate": 1.6040955631399319e-06, + "loss": 0.0036, + "num_tokens": 10753417.0, + "reward": 2.029296875, + "reward_std": 0.07281853258609772, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17433346807956696, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.03125, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 757.453125, + "completions/mean_terminated_length": 752.3922119140625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.008363915678074593, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.22476886727806192, + "kl": 0.01910400390625, + "learning_rate": 1.6382252559726963e-06, + "loss": 0.0103, + "num_tokens": 10990285.0, + "reward": 2.173828125, + "reward_std": 0.17708057165145874, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.387910932302475, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1711.0, + "completions/max_terminated_length": 1711.0, + "completions/mean_length": 744.09375, + "completions/mean_terminated_length": 744.09375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.008534607834769992, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20727085090576203, + "kl": 0.02362060546875, + "learning_rate": 1.6723549488054607e-06, + "loss": -0.0089, + "num_tokens": 11220549.0, + "reward": 2.150390625, + "reward_std": 0.11181852966547012, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.03125, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1646.0, + "completions/max_terminated_length": 1646.0, + "completions/mean_length": 530.2578125, + "completions/mean_terminated_length": 530.2578125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.008705299991465392, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3595829932026685, + "kl": 0.02984619140625, + "learning_rate": 1.7064846416382255e-06, + "loss": -0.008, + "num_tokens": 11398727.0, + "reward": 2.1943359375, + "reward_std": 0.16500356793403625, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 681.25390625, + "completions/mean_terminated_length": 675.8941650390625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.008875992148160792, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.3048027066665769, + "kl": 0.020263671875, + "learning_rate": 1.74061433447099e-06, + "loss": 0.0057, + "num_tokens": 11614856.0, + "reward": 2.4189453125, + "reward_std": 0.3608502447605133, + "rewards/accuracy_reward/mean": 0.4541666805744171, + "rewards/accuracy_reward/std": 0.49893540143966675, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 607.40234375, + "completions/mean_terminated_length": 607.40234375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.009046684304856192, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.1927535663058829, + "kl": 0.02667236328125, + "learning_rate": 1.7747440273037543e-06, + "loss": 0.0027, + "num_tokens": 11809567.0, + "reward": 2.171875, + "reward_std": 0.10771197080612183, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1506.0, + "completions/mean_length": 615.24609375, + "completions/mean_terminated_length": 609.6275024414062, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.009217376461551591, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2978078187586373, + "kl": 0.0228271484375, + "learning_rate": 1.808873720136519e-06, + "loss": 0.0296, + "num_tokens": 12002766.0, + "reward": 2.095703125, + "reward_std": 0.16541722416877747, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 669.75, + "completions/mean_terminated_length": 664.3451538085938, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.009388068618246991, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.2965230600885344, + "kl": 0.023529052734375, + "learning_rate": 1.8430034129692834e-06, + "loss": 0.0021, + "num_tokens": 12217262.0, + "reward": 2.1259765625, + "reward_std": 0.22819241881370544, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 657.33203125, + "completions/mean_terminated_length": 640.8419189453125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.009558760774942391, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2847110715388132, + "kl": 0.024322509765625, + "learning_rate": 1.8771331058020478e-06, + "loss": 0.0536, + "num_tokens": 12437219.0, + "reward": 2.1474609375, + "reward_std": 0.20018360018730164, + "rewards/accuracy_reward/mean": 0.17916665971279144, + "rewards/accuracy_reward/std": 0.38429322838783264, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.0808708667755127, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 777.65234375, + "completions/mean_terminated_length": 767.6495971679688, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.00972945293163779, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.22213378435709752, + "kl": 0.024383544921875, + "learning_rate": 1.9112627986348124e-06, + "loss": 0.0451, + "num_tokens": 12678554.0, + "reward": 2.1181640625, + "reward_std": 0.15275543928146362, + "rewards/accuracy_reward/mean": 0.14166666567325592, + "rewards/accuracy_reward/std": 0.3494366705417633, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1477.0, + "completions/mean_length": 718.05078125, + "completions/mean_terminated_length": 707.5787353515625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.00990014508833319, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.28191643931193966, + "kl": 0.020477294921875, + "learning_rate": 1.945392491467577e-06, + "loss": 0.0513, + "num_tokens": 12901015.0, + "reward": 2.1689453125, + "reward_std": 0.17753830552101135, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1740.0, + "completions/max_terminated_length": 1740.0, + "completions/mean_length": 813.0390625, + "completions/mean_terminated_length": 813.0390625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.01007083724502859, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.20004007293594697, + "kl": 0.019744873046875, + "learning_rate": 1.9795221843003416e-06, + "loss": 0.02, + "num_tokens": 13150625.0, + "reward": 2.2724609375, + "reward_std": 0.13990169763565063, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 651.23046875, + "completions/mean_terminated_length": 645.7529907226562, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.01024152940172399, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.230630231951018, + "kl": 0.0245361328125, + "learning_rate": 2.013651877133106e-06, + "loss": 0.0183, + "num_tokens": 13361468.0, + "reward": 2.2197265625, + "reward_std": 0.13253937661647797, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.41942715644836426, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1643.0, + "completions/mean_length": 726.03515625, + "completions/mean_terminated_length": 720.8510131835938, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.01041222155841939, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.20261858344063152, + "kl": 0.021148681640625, + "learning_rate": 2.0477815699658705e-06, + "loss": 0.0245, + "num_tokens": 13587525.0, + "reward": 2.220703125, + "reward_std": 0.21280798316001892, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42443734407424927, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.05828297883272171, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1896.0, + "completions/max_terminated_length": 1896.0, + "completions/mean_length": 729.28125, + "completions/mean_terminated_length": 729.28125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.01058291371511479, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.23135925709882793, + "kl": 0.023712158203125, + "learning_rate": 2.0819112627986347e-06, + "loss": -0.0035, + "num_tokens": 13818445.0, + "reward": 2.16796875, + "reward_std": 0.15128782391548157, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 687.8125, + "completions/mean_terminated_length": 682.4784545898438, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.010753605871810191, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20520538261174495, + "kl": 0.02166748046875, + "learning_rate": 2.1160409556313997e-06, + "loss": 0.0213, + "num_tokens": 14035309.0, + "reward": 2.1015625, + "reward_std": 0.1621098518371582, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1765.0, + "completions/mean_length": 699.51953125, + "completions/mean_terminated_length": 694.2313842773438, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.010924298028505591, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.30581510599832934, + "kl": 0.02569580078125, + "learning_rate": 2.150170648464164e-06, + "loss": 0.0151, + "num_tokens": 14256658.0, + "reward": 2.3173828125, + "reward_std": 0.2230260670185089, + "rewards/accuracy_reward/mean": 0.32421875, + "rewards/accuracy_reward/std": 0.46899911761283875, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1874.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 711.1171875, + "completions/mean_terminated_length": 711.1171875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.01109499018520099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.2587496642745615, + "kl": 0.0216064453125, + "learning_rate": 2.1843003412969285e-06, + "loss": 0.0053, + "num_tokens": 14485088.0, + "reward": 2.18359375, + "reward_std": 0.22744406759738922, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.387910932302475, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 821.25, + "completions/mean_terminated_length": 816.4392700195312, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.01126568234189639, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20588936049692244, + "kl": 0.022705078125, + "learning_rate": 2.218430034129693e-06, + "loss": 0.0171, + "num_tokens": 14739312.0, + "reward": 2.02734375, + "reward_std": 0.14297816157341003, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 678.2421875, + "completions/mean_terminated_length": 672.87060546875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.01143637449859179, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.11696192913844981, + "kl": 0.022247314453125, + "learning_rate": 2.2525597269624573e-06, + "loss": 0.0259, + "num_tokens": 14948078.0, + "reward": 2.0595703125, + "reward_std": 0.08008454740047455, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1876.0, + "completions/max_terminated_length": 1876.0, + "completions/mean_length": 694.7421875, + "completions/mean_terminated_length": 694.7421875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.01160706665528719, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.29416035036184035, + "kl": 0.02423095703125, + "learning_rate": 2.286689419795222e-06, + "loss": 0.0211, + "num_tokens": 15169692.0, + "reward": 2.265625, + "reward_std": 0.21166008710861206, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.4425306022167206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 721.58203125, + "completions/mean_terminated_length": 716.3804321289062, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.01177775881198259, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.21043074063270742, + "kl": 0.0225830078125, + "learning_rate": 2.3208191126279866e-06, + "loss": 0.017, + "num_tokens": 15398305.0, + "reward": 2.296875, + "reward_std": 0.19236691296100616, + "rewards/accuracy_reward/mean": 0.30859375, + "rewards/accuracy_reward/std": 0.46281787753105164, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 814.22265625, + "completions/mean_terminated_length": 779.5381469726562, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.01194845096867799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.22660252900141578, + "kl": 0.02520751953125, + "learning_rate": 2.354948805460751e-06, + "loss": 0.0324, + "num_tokens": 15650506.0, + "reward": 2.134765625, + "reward_std": 0.26768726110458374, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17433346807956696, + "rewards/tag_count_reward/mean": 0.978515625, + "rewards/tag_count_reward/std": 0.12338106334209442, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 654.94140625, + "completions/mean_terminated_length": 649.4784545898438, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.01211914312537339, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.22593903715279096, + "kl": 0.02099609375, + "learning_rate": 2.3890784982935154e-06, + "loss": 0.0122, + "num_tokens": 15860987.0, + "reward": 2.1376953125, + "reward_std": 0.1641109138727188, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35231640934944153, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1573.0, + "completions/max_terminated_length": 1573.0, + "completions/mean_length": 642.3125, + "completions/mean_terminated_length": 642.3125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.01228983528206879, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2928231682979465, + "kl": 0.0263671875, + "learning_rate": 2.42320819112628e-06, + "loss": -0.0096, + "num_tokens": 16071931.0, + "reward": 2.109375, + "reward_std": 0.1330379694700241, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1767.0, + "completions/max_terminated_length": 1767.0, + "completions/mean_length": 697.09375, + "completions/mean_terminated_length": 697.09375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.012460527438764189, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.11610643939557473, + "kl": 0.021728515625, + "learning_rate": 2.4573378839590446e-06, + "loss": 0.0062, + "num_tokens": 16291395.0, + "reward": 2.0625, + "reward_std": 0.06404343992471695, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24253563582897186, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 753.29296875, + "completions/mean_terminated_length": 743.0984497070312, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.012631219595459589, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.26022262666646606, + "kl": 0.01910400390625, + "learning_rate": 2.491467576791809e-06, + "loss": 0.0239, + "num_tokens": 16523806.0, + "reward": 2.083984375, + "reward_std": 0.22650974988937378, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0539139099419117, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1618.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 651.87109375, + "completions/mean_terminated_length": 651.87109375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.012801911752154988, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.17039938336289753, + "kl": 0.028350830078125, + "learning_rate": 2.5255972696245735e-06, + "loss": 0.001, + "num_tokens": 16737469.0, + "reward": 2.0625, + "reward_std": 0.055901698768138885, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24253563582897186, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 630.91015625, + "completions/mean_terminated_length": 625.3529663085938, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.012972603908850388, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.35548020215914033, + "kl": 0.0255126953125, + "learning_rate": 2.559726962457338e-06, + "loss": 0.0189, + "num_tokens": 16938550.0, + "reward": 2.1435546875, + "reward_std": 0.1781865656375885, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.056234680116176605, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 666.74609375, + "completions/mean_terminated_length": 666.74609375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.013143296065545788, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.26971805627090895, + "kl": 0.025726318359375, + "learning_rate": 2.5938566552901023e-06, + "loss": -0.0107, + "num_tokens": 17151397.0, + "reward": 2.2734375, + "reward_std": 0.1969657838344574, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 669.44140625, + "completions/mean_terminated_length": 669.44140625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.013313988222241188, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.34720947271536523, + "kl": 0.02764892578125, + "learning_rate": 2.6279863481228673e-06, + "loss": -0.0125, + "num_tokens": 17364150.0, + "reward": 2.1640625, + "reward_std": 0.1849614679813385, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.3710577189922333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 738.13671875, + "completions/mean_terminated_length": 733.0000610351562, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.013484680378936588, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.22627825897852072, + "kl": 0.02178955078125, + "learning_rate": 2.662116040955632e-06, + "loss": 0.0238, + "num_tokens": 17596489.0, + "reward": 2.0947265625, + "reward_std": 0.17080409824848175, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1564.0, + "completions/mean_length": 647.84375, + "completions/mean_terminated_length": 642.3529663085938, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.013655372535631987, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2664885540945657, + "kl": 0.029266357421875, + "learning_rate": 2.696245733788396e-06, + "loss": 0.0089, + "num_tokens": 17805185.0, + "reward": 2.1142578125, + "reward_std": 0.187485933303833, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3268752694129944, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 627.50390625, + "completions/mean_terminated_length": 616.3189086914062, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.013826064692327387, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.26361715825847487, + "kl": 0.03411865234375, + "learning_rate": 2.7303754266211608e-06, + "loss": 0.0401, + "num_tokens": 18006306.0, + "reward": 2.115234375, + "reward_std": 0.16322284936904907, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1448.0, + "completions/max_terminated_length": 1448.0, + "completions/mean_length": 668.59765625, + "completions/mean_terminated_length": 668.59765625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.013996756849022787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.23477737951535552, + "kl": 0.02520751953125, + "learning_rate": 2.7645051194539254e-06, + "loss": -0.0137, + "num_tokens": 18213483.0, + "reward": 2.1396484375, + "reward_std": 0.18578988313674927, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 558.875, + "completions/mean_terminated_length": 558.875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.014167449005718187, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.3891077648280843, + "kl": 0.03350830078125, + "learning_rate": 2.7986348122866896e-06, + "loss": 0.0101, + "num_tokens": 18404379.0, + "reward": 2.275390625, + "reward_std": 0.24874238669872284, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.45048993825912476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.06976743042469025, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1350.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 622.171875, + "completions/mean_terminated_length": 622.171875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.014338141162413586, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.3456656022820857, + "kl": 0.029754638671875, + "learning_rate": 2.832764505119454e-06, + "loss": 0.001, + "num_tokens": 18606823.0, + "reward": 2.25390625, + "reward_std": 0.26223528385162354, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.4382871091365814, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1774.0, + "completions/max_terminated_length": 1774.0, + "completions/mean_length": 759.0859375, + "completions/mean_terminated_length": 759.0859375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.014508833319108986, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.19701181646775417, + "kl": 0.023468017578125, + "learning_rate": 2.8668941979522184e-06, + "loss": -0.0033, + "num_tokens": 18838525.0, + "reward": 2.23046875, + "reward_std": 0.10386522114276886, + "rewards/accuracy_reward/mean": 0.23046875, + "rewards/accuracy_reward/std": 0.4219578504562378, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 804.83984375, + "completions/mean_terminated_length": 790.0988159179688, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.014679525475804386, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.23736512903924814, + "kl": 0.0235595703125, + "learning_rate": 2.901023890784983e-06, + "loss": 0.0479, + "num_tokens": 19086420.0, + "reward": 2.060546875, + "reward_std": 0.18541032075881958, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28082075715065, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.08226180076599121, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1506.0, + "completions/max_terminated_length": 1506.0, + "completions/mean_length": 728.10546875, + "completions/mean_terminated_length": 728.10546875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.014850217632499786, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.27360678913963005, + "kl": 0.021148681640625, + "learning_rate": 2.9351535836177476e-06, + "loss": 0.0094, + "num_tokens": 19310943.0, + "reward": 2.2109375, + "reward_std": 0.208065003156662, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4087733030319214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 678.59765625, + "completions/mean_terminated_length": 667.81494140625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.015020909789195187, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.29471342374978915, + "kl": 0.02398681640625, + "learning_rate": 2.969283276450512e-06, + "loss": 0.0315, + "num_tokens": 19522376.0, + "reward": 2.4609375, + "reward_std": 0.29191040992736816, + "rewards/accuracy_reward/mean": 0.47265625, + "rewards/accuracy_reward/std": 0.5002297759056091, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 636.125, + "completions/mean_terminated_length": 636.125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.015191601945890587, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.19365945361892892, + "kl": 0.025604248046875, + "learning_rate": 3.003412969283277e-06, + "loss": 0.008, + "num_tokens": 19735128.0, + "reward": 2.1201171875, + "reward_std": 0.10492250323295593, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1907.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 726.671875, + "completions/mean_terminated_length": 726.671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.015362294102585987, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.223789868397834, + "kl": 0.02410888671875, + "learning_rate": 3.0375426621160415e-06, + "loss": 0.0207, + "num_tokens": 19958644.0, + "reward": 2.126953125, + "reward_std": 0.10695268213748932, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.03125, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 701.0703125, + "completions/mean_terminated_length": 695.7882690429688, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.015532986259281387, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.2352647347092609, + "kl": 0.02484130859375, + "learning_rate": 3.0716723549488057e-06, + "loss": 0.0299, + "num_tokens": 20179782.0, + "reward": 2.1806640625, + "reward_std": 0.22329594194889069, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1805.0, + "completions/max_terminated_length": 1805.0, + "completions/mean_length": 684.84765625, + "completions/mean_terminated_length": 684.84765625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.015703678415976786, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.16125001887091622, + "kl": 0.02569580078125, + "learning_rate": 3.1058020477815703e-06, + "loss": 0.0095, + "num_tokens": 20393519.0, + "reward": 2.1953125, + "reward_std": 0.091475710272789, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.39721766114234924, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1767.0, + "completions/max_terminated_length": 1767.0, + "completions/mean_length": 731.48828125, + "completions/mean_terminated_length": 731.48828125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.015874370572672186, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2609837047395921, + "kl": 0.025054931640625, + "learning_rate": 3.139931740614335e-06, + "loss": 0.0133, + "num_tokens": 20620588.0, + "reward": 2.08984375, + "reward_std": 0.13204212486743927, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1823.0, + "completions/max_terminated_length": 1823.0, + "completions/mean_length": 734.171875, + "completions/mean_terminated_length": 734.171875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.016045062729367586, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.22000767571404442, + "kl": 0.02618408203125, + "learning_rate": 3.174061433447099e-06, + "loss": 0.0038, + "num_tokens": 20850184.0, + "reward": 2.1552734375, + "reward_std": 0.13501232862472534, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.36746934056282043, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1636.0, + "completions/mean_length": 792.35546875, + "completions/mean_terminated_length": 767.3426513671875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.016215754886062986, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.1939894463277693, + "kl": 0.02508544921875, + "learning_rate": 3.2081911262798638e-06, + "loss": 0.0608, + "num_tokens": 21094627.0, + "reward": 2.0166015625, + "reward_std": 0.1900005340576172, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9853515625, + "rewards/tag_count_reward/std": 0.1039903536438942, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1480.0, + "completions/max_terminated_length": 1480.0, + "completions/mean_length": 746.8203125, + "completions/mean_terminated_length": 746.8203125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.016386447042758386, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.11313950660524232, + "kl": 0.024200439453125, + "learning_rate": 3.2423208191126284e-06, + "loss": -0.0068, + "num_tokens": 21330549.0, + "reward": 2.06640625, + "reward_std": 0.015625, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 797.12109375, + "completions/mean_terminated_length": 767.1000366210938, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.016557139199453785, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.14104742023767758, + "kl": 0.02191162109375, + "learning_rate": 3.2764505119453926e-06, + "loss": 0.0365, + "num_tokens": 21574900.0, + "reward": 2.154296875, + "reward_std": 0.16763019561767578, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4087733030319214, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.978515625, + "rewards/tag_count_reward/std": 0.11934192478656769, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 796.32421875, + "completions/mean_terminated_length": 791.4157104492188, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.016727831356149185, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.14512540519898048, + "kl": 0.0230712890625, + "learning_rate": 3.310580204778157e-06, + "loss": 0.0238, + "num_tokens": 21817495.0, + "reward": 2.044921875, + "reward_std": 0.10014503449201584, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1942.0, + "completions/max_terminated_length": 1942.0, + "completions/mean_length": 661.23828125, + "completions/mean_terminated_length": 661.23828125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.016898523512844585, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.27051432068634074, + "kl": 0.028778076171875, + "learning_rate": 3.3447098976109214e-06, + "loss": 0.007, + "num_tokens": 22024644.0, + "reward": 2.0966796875, + "reward_std": 0.10337906330823898, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 710.828125, + "completions/mean_terminated_length": 678.7360229492188, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.017069215669539985, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2341136272257988, + "kl": 0.027984619140625, + "learning_rate": 3.378839590443686e-06, + "loss": 0.028, + "num_tokens": 22242440.0, + "reward": 2.283203125, + "reward_std": 0.26649540662765503, + "rewards/accuracy_reward/mean": 0.32421875, + "rewards/accuracy_reward/std": 0.46899911761283875, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.982421875, + "rewards/tag_count_reward/std": 0.11368858069181442, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 668.94140625, + "completions/mean_terminated_length": 668.94140625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.017239907826235384, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2221903030857499, + "kl": 0.027252197265625, + "learning_rate": 3.412969283276451e-06, + "loss": -0.0075, + "num_tokens": 22456505.0, + "reward": 2.29296875, + "reward_std": 0.14109118282794952, + "rewards/accuracy_reward/mean": 0.29296875, + "rewards/accuracy_reward/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1484.0, + "completions/max_terminated_length": 1484.0, + "completions/mean_length": 748.9140625, + "completions/mean_terminated_length": 748.9140625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.017410599982930784, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.17720936525088388, + "kl": 0.025177001953125, + "learning_rate": 3.4470989761092157e-06, + "loss": 0.0053, + "num_tokens": 22690275.0, + "reward": 2.09375, + "reward_std": 0.08054865896701813, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 686.57421875, + "completions/mean_terminated_length": 681.2353515625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.017581292139626184, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.25473000616383845, + "kl": 0.027313232421875, + "learning_rate": 3.48122866894198e-06, + "loss": 0.0073, + "num_tokens": 22908326.0, + "reward": 2.0322265625, + "reward_std": 0.12548616528511047, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 777.83203125, + "completions/mean_terminated_length": 762.770751953125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.017751984296321584, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.26720016201584257, + "kl": 0.023468017578125, + "learning_rate": 3.5153583617747445e-06, + "loss": 0.0342, + "num_tokens": 23149115.0, + "reward": 2.1591796875, + "reward_std": 0.2461339682340622, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38467901945114136, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.0808708667755127, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 835.8359375, + "completions/mean_terminated_length": 821.4624633789062, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.017922676453016984, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.18310904371355669, + "kl": 0.023834228515625, + "learning_rate": 3.5494880546075087e-06, + "loss": 0.0361, + "num_tokens": 23404881.0, + "reward": 2.1162109375, + "reward_std": 0.19914110004901886, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.0808708667755127, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1654.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 604.71484375, + "completions/mean_terminated_length": 604.71484375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.018093368609712383, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2564181712284548, + "kl": 0.029266357421875, + "learning_rate": 3.5836177474402733e-06, + "loss": -0.0045, + "num_tokens": 23597688.0, + "reward": 2.20703125, + "reward_std": 0.16398435831069946, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40597182512283325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1452.0, + "completions/max_terminated_length": 1452.0, + "completions/mean_length": 703.80859375, + "completions/mean_terminated_length": 703.80859375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.018264060766407783, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.23166905814053096, + "kl": 0.027313232421875, + "learning_rate": 3.617747440273038e-06, + "loss": 0.011, + "num_tokens": 23820551.0, + "reward": 2.2421875, + "reward_std": 0.17153127491474152, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4292463958263397, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1618.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 574.64453125, + "completions/mean_terminated_length": 574.64453125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.018434752923103183, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2855733991434547, + "kl": 0.028961181640625, + "learning_rate": 3.651877133105802e-06, + "loss": 0.0028, + "num_tokens": 24013612.0, + "reward": 2.076171875, + "reward_std": 0.11961116641759872, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.03125, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 719.453125, + "completions/mean_terminated_length": 719.453125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.018605445079798583, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1850345922059434, + "kl": 0.023193359375, + "learning_rate": 3.6860068259385667e-06, + "loss": 0.0067, + "num_tokens": 24236768.0, + "reward": 2.1640625, + "reward_std": 0.16458465158939362, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.3710577189922333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 783.3984375, + "completions/mean_terminated_length": 778.4392700195312, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.018776137236493982, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2035334407576515, + "kl": 0.023529052734375, + "learning_rate": 3.7201365187713314e-06, + "loss": 0.0071, + "num_tokens": 24476838.0, + "reward": 2.1259765625, + "reward_std": 0.19771617650985718, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1371.0, + "completions/max_terminated_length": 1371.0, + "completions/mean_length": 648.4609375, + "completions/mean_terminated_length": 648.4609375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.018946829393189382, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.325752415550003, + "kl": 0.02850341796875, + "learning_rate": 3.7542662116040956e-06, + "loss": 0.0082, + "num_tokens": 24682748.0, + "reward": 2.2109375, + "reward_std": 0.2503623962402344, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4087733030319214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1523.0, + "completions/max_terminated_length": 1523.0, + "completions/mean_length": 612.68359375, + "completions/mean_terminated_length": 612.68359375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.019117521549884782, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2859806530036638, + "kl": 0.026611328125, + "learning_rate": 3.78839590443686e-06, + "loss": 0.0052, + "num_tokens": 24882523.0, + "reward": 2.158203125, + "reward_std": 0.18802213668823242, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1429.0, + "completions/max_terminated_length": 1429.0, + "completions/mean_length": 744.6953125, + "completions/mean_terminated_length": 744.6953125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.019288213706580182, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.15711373541520818, + "kl": 0.024749755859375, + "learning_rate": 3.822525597269625e-06, + "loss": 0.0034, + "num_tokens": 25115821.0, + "reward": 2.125, + "reward_std": 0.11123998463153839, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1545.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 729.9296875, + "completions/mean_terminated_length": 729.9296875, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.01945890586327558, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.22765920980183085, + "kl": 0.02825927734375, + "learning_rate": 3.8566552901023894e-06, + "loss": -0.0076, + "num_tokens": 25343579.0, + "reward": 2.1171875, + "reward_std": 0.1624118983745575, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1565.0, + "completions/max_terminated_length": 1565.0, + "completions/mean_length": 672.57421875, + "completions/mean_terminated_length": 672.57421875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.01962959801997098, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2789722547005921, + "kl": 0.02899169921875, + "learning_rate": 3.890784982935154e-06, + "loss": -0.0193, + "num_tokens": 25568670.0, + "reward": 2.19921875, + "reward_std": 0.18396370112895966, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1769.0, + "completions/max_terminated_length": 1769.0, + "completions/mean_length": 781.41015625, + "completions/mean_terminated_length": 781.41015625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.01980029017666638, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.27649106466611606, + "kl": 0.023345947265625, + "learning_rate": 3.924914675767919e-06, + "loss": -0.0068, + "num_tokens": 25813767.0, + "reward": 2.123046875, + "reward_std": 0.19247952103614807, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 765.12890625, + "completions/mean_terminated_length": 760.0980834960938, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.01997098233336178, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.21457372079127252, + "kl": 0.026947021484375, + "learning_rate": 3.959044368600683e-06, + "loss": 0.0275, + "num_tokens": 26046776.0, + "reward": 2.1884765625, + "reward_std": 0.2054077386856079, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.39721766114234924, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1827.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 758.01953125, + "completions/mean_terminated_length": 758.01953125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.02014167449005718, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.24014106563483797, + "kl": 0.025665283203125, + "learning_rate": 3.993174061433447e-06, + "loss": 0.0129, + "num_tokens": 26277757.0, + "reward": 2.2880859375, + "reward_std": 0.2407703995704651, + "rewards/accuracy_reward/mean": 0.29296875, + "rewards/accuracy_reward/std": 0.45601576566696167, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1880.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 743.8984375, + "completions/mean_terminated_length": 743.8984375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.02031236664675258, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20620036830918545, + "kl": 0.02435302734375, + "learning_rate": 4.027303754266212e-06, + "loss": 0.0043, + "num_tokens": 26511203.0, + "reward": 2.203125, + "reward_std": 0.1504075974225998, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40311288833618164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1455.0, + "completions/mean_length": 718.80859375, + "completions/mean_terminated_length": 713.5961303710938, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.02048305880344798, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.25486492229035845, + "kl": 0.0281982421875, + "learning_rate": 4.061433447098976e-06, + "loss": 0.0087, + "num_tokens": 26741266.0, + "reward": 2.095703125, + "reward_std": 0.21390876173973083, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1602.0, + "completions/max_terminated_length": 1602.0, + "completions/mean_length": 731.08984375, + "completions/mean_terminated_length": 731.08984375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.02065375096014338, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2387478799095891, + "kl": 0.027618408203125, + "learning_rate": 4.095563139931741e-06, + "loss": 0.0159, + "num_tokens": 26969609.0, + "reward": 2.171875, + "reward_std": 0.14459210634231567, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1633.0, + "completions/max_terminated_length": 1633.0, + "completions/mean_length": 804.0859375, + "completions/mean_terminated_length": 804.0859375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.02082444311683878, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.22488187173229768, + "kl": 0.02423095703125, + "learning_rate": 4.1296928327645055e-06, + "loss": -0.0008, + "num_tokens": 27214031.0, + "reward": 2.2578125, + "reward_std": 0.15119513869285583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 632.63671875, + "completions/mean_terminated_length": 632.63671875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.02099513527353418, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.22078832489304992, + "kl": 0.03289794921875, + "learning_rate": 4.163822525597269e-06, + "loss": 0.0055, + "num_tokens": 27416322.0, + "reward": 2.234375, + "reward_std": 0.15131491422653198, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42443734407424927, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 574.80859375, + "completions/mean_terminated_length": 574.80859375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.02116582743022958, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.24323545789598744, + "kl": 0.030731201171875, + "learning_rate": 4.197952218430034e-06, + "loss": 0.0045, + "num_tokens": 27602689.0, + "reward": 2.2734375, + "reward_std": 0.15314997732639313, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1414.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 607.5703125, + "completions/mean_terminated_length": 607.5703125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.02133651958692498, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.23891361392370583, + "kl": 0.0340576171875, + "learning_rate": 4.232081911262799e-06, + "loss": 0.0059, + "num_tokens": 27801539.0, + "reward": 2.2421875, + "reward_std": 0.17922285199165344, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4292463958263397, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1403.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 671.26953125, + "completions/mean_terminated_length": 671.26953125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.021507211743620382, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.17192368218953094, + "kl": 0.031585693359375, + "learning_rate": 4.266211604095564e-06, + "loss": 0.0123, + "num_tokens": 28018760.0, + "reward": 2.14453125, + "reward_std": 0.1347825974225998, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35231640934944153, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1361.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 630.546875, + "completions/mean_terminated_length": 630.546875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.021677903900315782, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.23203232318064715, + "kl": 0.031005859375, + "learning_rate": 4.300341296928328e-06, + "loss": 0.0133, + "num_tokens": 28225812.0, + "reward": 2.16015625, + "reward_std": 0.13763156533241272, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.36746934056282043, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1368.0, + "completions/max_terminated_length": 1368.0, + "completions/mean_length": 656.29296875, + "completions/mean_terminated_length": 656.29296875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.021848596057011182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.22456507329534872, + "kl": 0.0338134765625, + "learning_rate": 4.3344709897610924e-06, + "loss": 0.011, + "num_tokens": 28432751.0, + "reward": 2.3427734375, + "reward_std": 0.1778957098722458, + "rewards/accuracy_reward/mean": 0.34765625, + "rewards/accuracy_reward/std": 0.4771590530872345, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 573.3671875, + "completions/mean_terminated_length": 573.3671875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.02201928821370658, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2446789888156702, + "kl": 0.03607177734375, + "learning_rate": 4.368600682593857e-06, + "loss": 0.0279, + "num_tokens": 28621261.0, + "reward": 2.30859375, + "reward_std": 0.1446847766637802, + "rewards/accuracy_reward/mean": 0.30859375, + "rewards/accuracy_reward/std": 0.46281787753105164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1522.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 651.88671875, + "completions/mean_terminated_length": 651.88671875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.02218998037040198, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.15215184185417163, + "kl": 0.034423828125, + "learning_rate": 4.402730375426622e-06, + "loss": 0.0074, + "num_tokens": 28832176.0, + "reward": 2.27734375, + "reward_std": 0.06327171623706818, + "rewards/accuracy_reward/mean": 0.27734375, + "rewards/accuracy_reward/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1948.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 685.1015625, + "completions/mean_terminated_length": 685.1015625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.02236067252709738, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.21245455821307332, + "kl": 0.03228759765625, + "learning_rate": 4.436860068259386e-06, + "loss": -0.0041, + "num_tokens": 29050954.0, + "reward": 2.1259765625, + "reward_std": 0.16015759110450745, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1596.0, + "completions/max_terminated_length": 1596.0, + "completions/mean_length": 660.875, + "completions/mean_terminated_length": 660.875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.02253136468379278, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08753631611092626, + "kl": 0.033203125, + "learning_rate": 4.47098976109215e-06, + "loss": 0.0021, + "num_tokens": 29259722.0, + "reward": 2.0546875, + "reward_std": 0.021347813308238983, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 679.33984375, + "completions/mean_terminated_length": 679.33984375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.02270205684048818, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.18945223359706162, + "kl": 0.03515625, + "learning_rate": 4.505119453924915e-06, + "loss": 0.0022, + "num_tokens": 29469201.0, + "reward": 2.09765625, + "reward_std": 0.13204212486743927, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 685.640625, + "completions/mean_terminated_length": 685.640625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.02287274899718358, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2814633542453643, + "kl": 0.03515625, + "learning_rate": 4.539249146757679e-06, + "loss": -0.0045, + "num_tokens": 29683253.0, + "reward": 2.2109375, + "reward_std": 0.19667133688926697, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4087733030319214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1482.0, + "completions/max_terminated_length": 1482.0, + "completions/mean_length": 605.76953125, + "completions/mean_terminated_length": 605.76953125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.02304344115387898, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2747882393512727, + "kl": 0.033203125, + "learning_rate": 4.573378839590444e-06, + "loss": 0.0036, + "num_tokens": 29876890.0, + "reward": 2.16015625, + "reward_std": 0.18032719194889069, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.36746934056282043, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1350.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 686.69921875, + "completions/mean_terminated_length": 686.69921875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.02321413331057438, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3095202354914591, + "kl": 0.038330078125, + "learning_rate": 4.6075085324232085e-06, + "loss": -0.0039, + "num_tokens": 30097053.0, + "reward": 2.2646484375, + "reward_std": 0.2132222056388855, + "rewards/accuracy_reward/mean": 0.26953125, + "rewards/accuracy_reward/std": 0.44458550214767456, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1730.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 716.85546875, + "completions/mean_terminated_length": 716.85546875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.02338482546726978, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.11612389634492264, + "kl": 0.03704833984375, + "learning_rate": 4.641638225255973e-06, + "loss": -0.0014, + "num_tokens": 30321784.0, + "reward": 2.0078125, + "reward_std": 0.021347813308238983, + "rewards/accuracy_reward/mean": 0.008333333767950535, + "rewards/accuracy_reward/std": 0.09109591692686081, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 757.55859375, + "completions/mean_terminated_length": 752.4981079101562, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.02355551762396518, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1630892809845905, + "kl": 0.03216552734375, + "learning_rate": 4.675767918088738e-06, + "loss": 0.0144, + "num_tokens": 30564551.0, + "reward": 2.138671875, + "reward_std": 0.1100650280714035, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1933.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 709.50390625, + "completions/mean_terminated_length": 709.50390625, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.02372620978066058, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.16537876126416973, + "kl": 0.030853271484375, + "learning_rate": 4.709897610921502e-06, + "loss": 0.0087, + "num_tokens": 30786456.0, + "reward": 2.19140625, + "reward_std": 0.07914985716342926, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.39417871832847595, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 793.19921875, + "completions/mean_terminated_length": 783.3189086914062, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.02389690193735598, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.19620324621456942, + "kl": 0.02740478515625, + "learning_rate": 4.744027303754267e-06, + "loss": 0.0365, + "num_tokens": 31033515.0, + "reward": 2.216796875, + "reward_std": 0.18567997217178345, + "rewards/accuracy_reward/mean": 0.23046875, + "rewards/accuracy_reward/std": 0.4219578504562378, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 724.0625, + "completions/mean_terminated_length": 718.87060546875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.02406759409405138, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.17003896897050963, + "kl": 0.03277587890625, + "learning_rate": 4.778156996587031e-06, + "loss": 0.0119, + "num_tokens": 31259355.0, + "reward": 2.1650390625, + "reward_std": 0.1242891401052475, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1571.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 733.91015625, + "completions/mean_terminated_length": 733.91015625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.02423828625074678, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.19968655711249722, + "kl": 0.03411865234375, + "learning_rate": 4.812286689419795e-06, + "loss": 0.0031, + "num_tokens": 31488516.0, + "reward": 2.0849609375, + "reward_std": 0.0807008445262909, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1247.0, + "completions/max_terminated_length": 1247.0, + "completions/mean_length": 636.87109375, + "completions/mean_terminated_length": 636.87109375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.02440897840744218, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.19569887994448035, + "kl": 0.0394287109375, + "learning_rate": 4.84641638225256e-06, + "loss": -0.0064, + "num_tokens": 31691027.0, + "reward": 2.1123046875, + "reward_std": 0.10998210310935974, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1459.0, + "completions/max_terminated_length": 1459.0, + "completions/mean_length": 652.08203125, + "completions/mean_terminated_length": 652.08203125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.02457967056413758, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.24886719120732376, + "kl": 0.03399658203125, + "learning_rate": 4.880546075085325e-06, + "loss": -0.0003, + "num_tokens": 31910248.0, + "reward": 2.24609375, + "reward_std": 0.19368696212768555, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.4382871091365814, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1549.0, + "completions/mean_length": 655.3203125, + "completions/mean_terminated_length": 649.85888671875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.024750362720832978, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.23075036509137992, + "kl": 0.03839111328125, + "learning_rate": 4.914675767918089e-06, + "loss": 0.023, + "num_tokens": 32130442.0, + "reward": 2.1142578125, + "reward_std": 0.17910391092300415, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 668.4375, + "completions/mean_terminated_length": 663.0274658203125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.024921054877528378, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.26509246533339553, + "kl": 0.0379638671875, + "learning_rate": 4.948805460750853e-06, + "loss": 0.0279, + "num_tokens": 32341114.0, + "reward": 2.1162109375, + "reward_std": 0.17143003642559052, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.05169277638196945, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1470.0, + "completions/mean_length": 601.3828125, + "completions/mean_terminated_length": 595.7098388671875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.025091747034223778, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.26893914279949027, + "kl": 0.04168701171875, + "learning_rate": 4.982935153583618e-06, + "loss": 0.0296, + "num_tokens": 32536172.0, + "reward": 2.2158203125, + "reward_std": 0.1604604721069336, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1435.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 575.703125, + "completions/mean_terminated_length": 575.703125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.025262439190919177, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.17296758732765172, + "kl": 0.03973388671875, + "learning_rate": 5.017064846416383e-06, + "loss": -0.001, + "num_tokens": 32721952.0, + "reward": 2.234375, + "reward_std": 0.11904004216194153, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42443734407424927, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 704.421875, + "completions/mean_terminated_length": 704.421875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.025433131347614577, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.138289997186277, + "kl": 0.03759765625, + "learning_rate": 5.051194539249147e-06, + "loss": 0.0053, + "num_tokens": 32940652.0, + "reward": 2.09765625, + "reward_std": 0.07779236882925034, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1540.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 691.63671875, + "completions/mean_terminated_length": 691.63671875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.025603823504309977, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.28861937345298144, + "kl": 0.0401611328125, + "learning_rate": 5.0853242320819115e-06, + "loss": 0.0128, + "num_tokens": 33164975.0, + "reward": 2.12890625, + "reward_std": 0.16021710634231567, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1281.0, + "completions/max_terminated_length": 1281.0, + "completions/mean_length": 649.96875, + "completions/mean_terminated_length": 649.96875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.025774515661005377, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2365589433392706, + "kl": 0.04443359375, + "learning_rate": 5.119453924914676e-06, + "loss": 0.0104, + "num_tokens": 33371367.0, + "reward": 2.0966796875, + "reward_std": 0.09512881934642792, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1705.0, + "completions/max_terminated_length": 1705.0, + "completions/mean_length": 669.22265625, + "completions/mean_terminated_length": 669.22265625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.025945207817700777, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.21775647552085042, + "kl": 0.035400390625, + "learning_rate": 5.153583617747441e-06, + "loss": 0.0182, + "num_tokens": 33578000.0, + "reward": 2.26171875, + "reward_std": 0.1708115190267563, + "rewards/accuracy_reward/mean": 0.26171875, + "rewards/accuracy_reward/std": 0.4404313564300537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1784.0, + "completions/max_terminated_length": 1784.0, + "completions/mean_length": 737.78125, + "completions/mean_terminated_length": 737.78125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.026115899974396176, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.24242640382401687, + "kl": 0.0401611328125, + "learning_rate": 5.1877133105802046e-06, + "loss": 0.0018, + "num_tokens": 33808904.0, + "reward": 2.1875, + "reward_std": 0.1355944126844406, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 736.69921875, + "completions/mean_terminated_length": 731.556884765625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.026286592131091576, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2553270948496218, + "kl": 0.0416259765625, + "learning_rate": 5.22184300341297e-06, + "loss": 0.0171, + "num_tokens": 34038603.0, + "reward": 2.1416015625, + "reward_std": 0.12987858057022095, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 661.0703125, + "completions/mean_terminated_length": 661.0703125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.026457284287786976, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.20318442100463965, + "kl": 0.0306396484375, + "learning_rate": 5.255972696245735e-06, + "loss": 0.0032, + "num_tokens": 34261517.0, + "reward": 2.140625, + "reward_std": 0.07064647972583771, + "rewards/accuracy_reward/mean": 0.15000000596046448, + "rewards/accuracy_reward/std": 0.3578176498413086, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1920.0, + "completions/mean_length": 709.23046875, + "completions/mean_terminated_length": 703.9804077148438, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.026627976444482376, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2964528887159929, + "kl": 0.04345703125, + "learning_rate": 5.290102389078498e-06, + "loss": 0.0298, + "num_tokens": 34485768.0, + "reward": 2.23046875, + "reward_std": 0.2274703085422516, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4292463958263397, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 714.078125, + "completions/mean_terminated_length": 703.5748291015625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.026798668601177775, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.25932845764800405, + "kl": 0.04400634765625, + "learning_rate": 5.324232081911264e-06, + "loss": 0.0301, + "num_tokens": 34714284.0, + "reward": 2.064453125, + "reward_std": 0.17609459161758423, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 728.95703125, + "completions/mean_terminated_length": 713.3162231445312, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.026969360757873175, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24491696723125306, + "kl": 0.0439453125, + "learning_rate": 5.358361774744028e-06, + "loss": 0.0245, + "num_tokens": 34941441.0, + "reward": 2.1318359375, + "reward_std": 0.22273758053779602, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.0808708667755127, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 624.88671875, + "completions/mean_terminated_length": 619.305908203125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.027140052914568575, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.14459693949329677, + "kl": 0.04315185546875, + "learning_rate": 5.392491467576792e-06, + "loss": 0.0195, + "num_tokens": 35146276.0, + "reward": 2.0107421875, + "reward_std": 0.07873210310935974, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1735.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 701.65234375, + "completions/mean_terminated_length": 701.65234375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.027310745071263975, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2109779137088134, + "kl": 0.04107666015625, + "learning_rate": 5.426621160409556e-06, + "loss": 0.0171, + "num_tokens": 35374411.0, + "reward": 2.1982421875, + "reward_std": 0.15782025456428528, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1614.0, + "completions/max_terminated_length": 1614.0, + "completions/mean_length": 733.21484375, + "completions/mean_terminated_length": 733.21484375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.027481437227959375, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.1587229495986913, + "kl": 0.03460693359375, + "learning_rate": 5.4607508532423215e-06, + "loss": -0.0081, + "num_tokens": 35603586.0, + "reward": 2.234375, + "reward_std": 0.11476518213748932, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42443734407424927, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1396.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 662.81640625, + "completions/mean_terminated_length": 662.81640625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.027652129384654774, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.2472462228355038, + "kl": 0.0406494140625, + "learning_rate": 5.494880546075085e-06, + "loss": 0.0072, + "num_tokens": 35809267.0, + "reward": 2.326171875, + "reward_std": 0.24120871722698212, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1687.0, + "completions/mean_length": 666.09765625, + "completions/mean_terminated_length": 660.678466796875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.027822821541350174, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2515103619542943, + "kl": 0.04400634765625, + "learning_rate": 5.529010238907851e-06, + "loss": 0.0273, + "num_tokens": 36022636.0, + "reward": 2.2265625, + "reward_std": 0.21852636337280273, + "rewards/accuracy_reward/mean": 0.23828125, + "rewards/accuracy_reward/std": 0.4268665909767151, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1479.0, + "completions/max_terminated_length": 1479.0, + "completions/mean_length": 716.28515625, + "completions/mean_terminated_length": 716.28515625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.027993513698045574, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.21623552533780424, + "kl": 0.04119873046875, + "learning_rate": 5.5631399317406145e-06, + "loss": 0.0101, + "num_tokens": 36248197.0, + "reward": 2.22265625, + "reward_std": 0.12003781646490097, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1378.0, + "completions/mean_length": 654.828125, + "completions/mean_terminated_length": 649.36474609375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.028164205854740974, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.14195513477320357, + "kl": 0.0458984375, + "learning_rate": 5.597269624573379e-06, + "loss": 0.023, + "num_tokens": 36459577.0, + "reward": 2.0400390625, + "reward_std": 0.09585705399513245, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21178513765335083, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1566.0, + "completions/max_terminated_length": 1566.0, + "completions/mean_length": 747.921875, + "completions/mean_terminated_length": 747.921875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.028334898011436373, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.3217091099095068, + "kl": 0.042724609375, + "learning_rate": 5.631399317406145e-06, + "loss": -0.0067, + "num_tokens": 36687077.0, + "reward": 2.1669921875, + "reward_std": 0.18445949256420135, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 802.828125, + "completions/mean_terminated_length": 793.0236206054688, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.028505590168131773, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.18406107449335785, + "kl": 0.0360107421875, + "learning_rate": 5.665529010238908e-06, + "loss": 0.0103, + "num_tokens": 36928393.0, + "reward": 2.107421875, + "reward_std": 0.1745760142803192, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3268752694129944, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1316.0, + "completions/max_terminated_length": 1316.0, + "completions/mean_length": 673.14453125, + "completions/mean_terminated_length": 673.14453125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.028676282324827173, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2330163218902185, + "kl": 0.04541015625, + "learning_rate": 5.699658703071673e-06, + "loss": -0.0004, + "num_tokens": 37145198.0, + "reward": 2.14453125, + "reward_std": 0.15380415320396423, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35231640934944153, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1935.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 706.578125, + "completions/mean_terminated_length": 706.578125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.028846974481522573, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.22365971144071226, + "kl": 0.046630859375, + "learning_rate": 5.733788395904437e-06, + "loss": 0.0035, + "num_tokens": 37365698.0, + "reward": 2.22265625, + "reward_std": 0.1679086685180664, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1738.0, + "completions/mean_length": 786.51171875, + "completions/mean_terminated_length": 776.5787353515625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.029017666638217973, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.19591717671415113, + "kl": 0.042236328125, + "learning_rate": 5.767918088737202e-06, + "loss": 0.006, + "num_tokens": 37608901.0, + "reward": 2.095703125, + "reward_std": 0.14489173889160156, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 746.9375, + "completions/mean_terminated_length": 746.9375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.029188358794913372, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.22117108397348081, + "kl": 0.0479736328125, + "learning_rate": 5.802047781569966e-06, + "loss": 0.0267, + "num_tokens": 37845317.0, + "reward": 2.2177734375, + "reward_std": 0.10204866528511047, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1616.0, + "completions/max_terminated_length": 1616.0, + "completions/mean_length": 773.4375, + "completions/mean_terminated_length": 773.4375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.029359050951608772, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1704678222506254, + "kl": 0.049072265625, + "learning_rate": 5.8361774744027315e-06, + "loss": 0.0015, + "num_tokens": 38088981.0, + "reward": 2.0546875, + "reward_std": 0.08384781330823898, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1490.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 718.25, + "completions/mean_terminated_length": 718.25, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.029529743108304172, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.17782695298973797, + "kl": 0.0543212890625, + "learning_rate": 5.870307167235495e-06, + "loss": 0.0144, + "num_tokens": 38315765.0, + "reward": 2.1796875, + "reward_std": 0.12609326839447021, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38467901945114136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1742.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 659.7421875, + "completions/mean_terminated_length": 659.7421875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.02970043526499957, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2809915947705423, + "kl": 0.05596923828125, + "learning_rate": 5.90443686006826e-06, + "loss": 0.015, + "num_tokens": 38529251.0, + "reward": 2.19921875, + "reward_std": 0.17350001633167267, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1459.0, + "completions/max_terminated_length": 1459.0, + "completions/mean_length": 713.9375, + "completions/mean_terminated_length": 713.9375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.02987112742169497, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.30901203610816175, + "kl": 0.05657958984375, + "learning_rate": 5.938566552901024e-06, + "loss": 0.026, + "num_tokens": 38769571.0, + "reward": 2.1826171875, + "reward_std": 0.2301645427942276, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1433.0, + "completions/max_terminated_length": 1433.0, + "completions/mean_length": 772.64453125, + "completions/mean_terminated_length": 772.64453125, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.030041819578390375, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.17663554496968503, + "kl": 0.04327392578125, + "learning_rate": 5.972696245733789e-06, + "loss": 0.0035, + "num_tokens": 39011048.0, + "reward": 2.15625, + "reward_std": 0.09011821448802948, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1796.0, + "completions/max_terminated_length": 1796.0, + "completions/mean_length": 754.03125, + "completions/mean_terminated_length": 754.03125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.030212511735085774, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.219056821718453, + "kl": 0.03729248046875, + "learning_rate": 6.006825938566554e-06, + "loss": 0.004, + "num_tokens": 39244976.0, + "reward": 2.22265625, + "reward_std": 0.19203896820545197, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1503.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 760.37890625, + "completions/mean_terminated_length": 760.37890625, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.030383203891781174, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20240856477596889, + "kl": 0.04010009765625, + "learning_rate": 6.0409556313993175e-06, + "loss": -0.0046, + "num_tokens": 39478017.0, + "reward": 2.1259765625, + "reward_std": 0.14834193885326385, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 763.80859375, + "completions/mean_terminated_length": 763.80859375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.030553896048476574, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16647577516713452, + "kl": 0.04156494140625, + "learning_rate": 6.075085324232083e-06, + "loss": -0.0068, + "num_tokens": 39716208.0, + "reward": 2.09765625, + "reward_std": 0.09617366641759872, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1789.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 726.3828125, + "completions/mean_terminated_length": 726.3828125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.030724588205171974, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2059817400824033, + "kl": 0.03607177734375, + "learning_rate": 6.109215017064847e-06, + "loss": 0.0044, + "num_tokens": 39940194.0, + "reward": 2.28125, + "reward_std": 0.11761415004730225, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.45048993825912476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1513.0, + "completions/max_terminated_length": 1513.0, + "completions/mean_length": 689.5078125, + "completions/mean_terminated_length": 689.5078125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.030895280361867374, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.18591332382611736, + "kl": 0.03656005859375, + "learning_rate": 6.143344709897611e-06, + "loss": 0.0111, + "num_tokens": 40154436.0, + "reward": 2.1181640625, + "reward_std": 0.20012402534484863, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1275.0, + "completions/max_terminated_length": 1275.0, + "completions/mean_length": 607.890625, + "completions/mean_terminated_length": 607.890625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.031065972518562773, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.1848248423463964, + "kl": 0.0428466796875, + "learning_rate": 6.177474402730376e-06, + "loss": 0.0053, + "num_tokens": 40353256.0, + "reward": 2.1171875, + "reward_std": 0.14083804190158844, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 757.21875, + "completions/mean_terminated_length": 757.21875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.031236664675258173, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19418903797565423, + "kl": 0.04315185546875, + "learning_rate": 6.211604095563141e-06, + "loss": 0.0022, + "num_tokens": 40607872.0, + "reward": 2.10546875, + "reward_std": 0.13047361373901367, + "rewards/accuracy_reward/mean": 0.1294642835855484, + "rewards/accuracy_reward/std": 0.3364649713039398, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1615.0, + "completions/max_terminated_length": 1615.0, + "completions/mean_length": 875.30859375, + "completions/mean_terminated_length": 875.30859375, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.03140735683195357, + "frac_reward_zero_std": 0.5625, + "grad_norm": 14.305232068338565, + "kl": 0.4619140625, + "learning_rate": 6.245733788395904e-06, + "loss": 0.0231, + "num_tokens": 40868703.0, + "reward": 2.0673828125, + "reward_std": 0.1626952886581421, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.05603000521659851, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1665.0, + "completions/max_terminated_length": 1665.0, + "completions/mean_length": 689.046875, + "completions/mean_terminated_length": 689.046875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.03157804898864897, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.3745899185900885, + "kl": 0.041748046875, + "learning_rate": 6.27986348122867e-06, + "loss": -0.02, + "num_tokens": 41082283.0, + "reward": 0.447265625, + "reward_std": 0.4263343811035156, + "rewards/accuracy_reward/mean": 0.34765625, + "rewards/accuracy_reward/std": 0.4771590530872345, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.087890625, + "rewards/tag_count_reward/std": 0.26951271295547485, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1755.0, + "completions/max_terminated_length": 1755.0, + "completions/mean_length": 756.125, + "completions/mean_terminated_length": 756.125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.03174874114534437, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.33005072565708954, + "kl": 0.04498291015625, + "learning_rate": 6.313993174061434e-06, + "loss": -0.0094, + "num_tokens": 41313435.0, + "reward": 0.27734375, + "reward_std": 0.28593939542770386, + "rewards/accuracy_reward/mean": 0.21484375, + "rewards/accuracy_reward/std": 0.4115184545516968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0625, + "rewards/tag_count_reward/std": 0.233263298869133, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1703.0, + "completions/mean_length": 789.390625, + "completions/mean_terminated_length": 784.4549560546875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.03191943330203977, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.31310944193641704, + "kl": 0.04315185546875, + "learning_rate": 6.348122866894198e-06, + "loss": -0.0061, + "num_tokens": 41556495.0, + "reward": 0.265625, + "reward_std": 0.3671824336051941, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.109375, + "rewards/tag_count_reward/std": 0.29827937483787537, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 879.24609375, + "completions/mean_terminated_length": 865.3873901367188, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.03209012545873517, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3589634042930685, + "kl": 0.0838623046875, + "learning_rate": 6.382252559726962e-06, + "loss": -0.0036, + "num_tokens": 41823902.0, + "reward": 0.7255859375, + "reward_std": 0.48054248094558716, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5537109375, + "rewards/tag_count_reward/std": 0.4529813230037689, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1209.37109375, + "completions/mean_terminated_length": 1020.7798461914062, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.03226081761543057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3550463365442549, + "kl": 0.111083984375, + "learning_rate": 6.4163822525597275e-06, + "loss": 0.1415, + "num_tokens": 42183549.0, + "reward": 0.9404296875, + "reward_std": 0.41217005252838135, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42443734407424927, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7060546875, + "rewards/tag_count_reward/std": 0.3539309799671173, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1102.61328125, + "completions/mean_terminated_length": 972.3600463867188, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.03243150977212597, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39865105544880003, + "kl": 0.10595703125, + "learning_rate": 6.450511945392492e-06, + "loss": 0.1374, + "num_tokens": 42503466.0, + "reward": 0.9326171875, + "reward_std": 0.3929970860481262, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7568359375, + "rewards/tag_count_reward/std": 0.3375258147716522, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1788.0, + "completions/mean_length": 845.6171875, + "completions/mean_terminated_length": 840.9020385742188, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.03260220192882137, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3059585960015291, + "kl": 0.0736083984375, + "learning_rate": 6.484641638225257e-06, + "loss": 0.0236, + "num_tokens": 42754984.0, + "reward": 1.255859375, + "reward_std": 0.3993765711784363, + "rewards/accuracy_reward/mean": 0.35546875, + "rewards/accuracy_reward/std": 0.4795927405357361, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.900390625, + "rewards/tag_count_reward/std": 0.2318718582391739, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1768.0, + "completions/mean_length": 802.3359375, + "completions/mean_terminated_length": 792.5275268554688, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.03277289408551677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3638590766777407, + "kl": 0.0789794921875, + "learning_rate": 6.518771331058021e-06, + "loss": 0.0352, + "num_tokens": 43000798.0, + "reward": 0.9521484375, + "reward_std": 0.2754119634628296, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.8505859375, + "rewards/tag_count_reward/std": 0.24632525444030762, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 750.72265625, + "completions/mean_terminated_length": 745.6353149414062, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.03294358624221217, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3294923619058409, + "kl": 0.06280517578125, + "learning_rate": 6.552901023890785e-06, + "loss": -0.0014, + "num_tokens": 43233815.0, + "reward": 1.2001953125, + "reward_std": 0.3050152361392975, + "rewards/accuracy_reward/mean": 0.27734375, + "rewards/accuracy_reward/std": 0.4485645890235901, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9228515625, + "rewards/tag_count_reward/std": 0.2008688896894455, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1979.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 885.4296875, + "completions/mean_terminated_length": 885.4296875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.03311427839890757, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.2702615702556501, + "kl": 0.06353759765625, + "learning_rate": 6.587030716723551e-06, + "loss": 0.0062, + "num_tokens": 43501333.0, + "reward": 1.083984375, + "reward_std": 0.19241949915885925, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.978515625, + "rewards/tag_count_reward/std": 0.0914338082075119, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1876.0, + "completions/mean_length": 988.26171875, + "completions/mean_terminated_length": 984.10595703125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.03328497055560297, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.20882697901442293, + "kl": 0.0599365234375, + "learning_rate": 6.621160409556314e-06, + "loss": 0.0225, + "num_tokens": 43800056.0, + "reward": 1.0732421875, + "reward_std": 0.21960589289665222, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9560546875, + "rewards/tag_count_reward/std": 0.18196223676204681, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2008.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 827.97265625, + "completions/mean_terminated_length": 827.97265625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.03345566271229837, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.22997745752331367, + "kl": 0.0631103515625, + "learning_rate": 6.655290102389079e-06, + "loss": -0.0055, + "num_tokens": 44052817.0, + "reward": 1.158203125, + "reward_std": 0.14826276898384094, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.05334262177348137, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 799.1015625, + "completions/mean_terminated_length": 799.1015625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.03362635486899377, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.29869640306793144, + "kl": 0.06817626953125, + "learning_rate": 6.689419795221843e-06, + "loss": 0.0042, + "num_tokens": 44296043.0, + "reward": 1.138671875, + "reward_std": 0.17184874415397644, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35231640934944153, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.04915804788470268, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2008.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 847.37890625, + "completions/mean_terminated_length": 847.37890625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.03379704702568917, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.254783695323003, + "kl": 0.06475830078125, + "learning_rate": 6.723549488054608e-06, + "loss": -0.0114, + "num_tokens": 44556828.0, + "reward": 1.1015625, + "reward_std": 0.12433473765850067, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.08821486681699753, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 879.05859375, + "completions/mean_terminated_length": 869.8543090820312, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.03396773918238457, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1684182135591008, + "kl": 0.05755615234375, + "learning_rate": 6.757679180887372e-06, + "loss": 0.0209, + "num_tokens": 44822891.0, + "reward": 1.18359375, + "reward_std": 0.10907364636659622, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.39721766114234924, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.10077822208404541, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1873.0, + "completions/max_terminated_length": 1873.0, + "completions/mean_length": 868.32421875, + "completions/mean_terminated_length": 868.32421875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.03413843133907997, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.17981740191957463, + "kl": 0.06195068359375, + "learning_rate": 6.7918088737201375e-06, + "loss": 0.0097, + "num_tokens": 45086926.0, + "reward": 1.1591796875, + "reward_std": 0.25032979249954224, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.08950243145227432, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1722.0, + "completions/max_terminated_length": 1722.0, + "completions/mean_length": 959.5390625, + "completions/mean_terminated_length": 959.5390625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.03430912349577537, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.2128137729348128, + "kl": 0.058837890625, + "learning_rate": 6.825938566552902e-06, + "loss": 0.0171, + "num_tokens": 45371144.0, + "reward": 1.072265625, + "reward_std": 0.15535962581634521, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.982421875, + "rewards/tag_count_reward/std": 0.11368858069181442, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1742.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 838.6484375, + "completions/mean_terminated_length": 838.6484375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.03447981565247077, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.1878062163814141, + "kl": 0.0721435546875, + "learning_rate": 6.860068259385666e-06, + "loss": 0.0071, + "num_tokens": 45624158.0, + "reward": 1.064453125, + "reward_std": 0.12515100836753845, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.09342263638973236, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 847.75, + "completions/mean_terminated_length": 843.043212890625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.03465050780916617, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.15218392106065184, + "kl": 0.06195068359375, + "learning_rate": 6.894197952218431e-06, + "loss": 0.0056, + "num_tokens": 45881326.0, + "reward": 1.119140625, + "reward_std": 0.10860462486743927, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.06976743042469025, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 916.90234375, + "completions/mean_terminated_length": 898.948486328125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.03482119996586157, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2219852544118866, + "kl": 0.0701904296875, + "learning_rate": 6.928327645051195e-06, + "loss": 0.0239, + "num_tokens": 46158725.0, + "reward": 1.0859375, + "reward_std": 0.19003704190254211, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.14665386080741882, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1679.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 765.34765625, + "completions/mean_terminated_length": 765.34765625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.03499189212255697, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.26362388512717594, + "kl": 0.0816650390625, + "learning_rate": 6.96245733788396e-06, + "loss": 0.0026, + "num_tokens": 46395982.0, + "reward": 1.169921875, + "reward_std": 0.17905542254447937, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.05828297883272171, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1781.0, + "completions/max_terminated_length": 1781.0, + "completions/mean_length": 761.44140625, + "completions/mean_terminated_length": 761.44140625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.03516258427925237, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24127572866823563, + "kl": 0.0758056640625, + "learning_rate": 6.9965870307167235e-06, + "loss": -0.0043, + "num_tokens": 46633071.0, + "reward": 1.09375, + "reward_std": 0.18541166186332703, + "rewards/accuracy_reward/mean": 0.13333334028720856, + "rewards/accuracy_reward/std": 0.34064507484436035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.17433346807956696, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 830.96875, + "completions/mean_terminated_length": 821.3858032226562, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.03533327643594777, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.25149302021910713, + "kl": 0.0704345703125, + "learning_rate": 7.030716723549489e-06, + "loss": 0.0233, + "num_tokens": 46883863.0, + "reward": 1.234375, + "reward_std": 0.30313947796821594, + "rewards/accuracy_reward/mean": 0.2666666805744171, + "rewards/accuracy_reward/std": 0.44314080476760864, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.10502100735902786, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1640.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 777.625, + "completions/mean_terminated_length": 777.625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.03550396859264317, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.25857903887521905, + "kl": 0.077392578125, + "learning_rate": 7.064846416382253e-06, + "loss": 0.0033, + "num_tokens": 47132071.0, + "reward": 1.3076171875, + "reward_std": 0.24091987311840057, + "rewards/accuracy_reward/mean": 0.30859375, + "rewards/accuracy_reward/std": 0.46281787753105164, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1986.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 852.83203125, + "completions/mean_terminated_length": 852.83203125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.03567466074933857, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1855058031216732, + "kl": 0.0787353515625, + "learning_rate": 7.098976109215017e-06, + "loss": 0.0242, + "num_tokens": 47386604.0, + "reward": 1.1240234375, + "reward_std": 0.14235913753509521, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.056234680116176605, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 886.5859375, + "completions/mean_terminated_length": 882.0314331054688, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.03584535290603397, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.19606905604600214, + "kl": 0.07763671875, + "learning_rate": 7.133105802047782e-06, + "loss": 0.0126, + "num_tokens": 47659378.0, + "reward": 1.12890625, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.08821486681699753, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1846.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 813.015625, + "completions/mean_terminated_length": 813.015625, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.03601604506272937, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.23608217224012384, + "kl": 0.077392578125, + "learning_rate": 7.167235494880547e-06, + "loss": 0.0185, + "num_tokens": 47917174.0, + "reward": 1.22265625, + "reward_std": 0.24329528212547302, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1624.0, + "completions/mean_length": 826.1796875, + "completions/mean_terminated_length": 821.3883056640625, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.03618673721942477, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.17663646050782303, + "kl": 0.07958984375, + "learning_rate": 7.201365187713312e-06, + "loss": 0.0153, + "num_tokens": 48169572.0, + "reward": 1.1767578125, + "reward_std": 0.12748534977436066, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.387910932302475, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.07141530513763428, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1752.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 810.2421875, + "completions/mean_terminated_length": 810.2421875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.036357429376120166, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.19697240058991344, + "kl": 0.0782470703125, + "learning_rate": 7.235494880546076e-06, + "loss": 0.0134, + "num_tokens": 48419218.0, + "reward": 1.16796875, + "reward_std": 0.13898904621601105, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1728.0, + "completions/max_terminated_length": 1728.0, + "completions/mean_length": 810.625, + "completions/mean_terminated_length": 810.625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.036528121532815566, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.14554586860164184, + "kl": 0.0712890625, + "learning_rate": 7.2696245733788405e-06, + "loss": 0.0025, + "num_tokens": 48667362.0, + "reward": 1.2880859375, + "reward_std": 0.1387527883052826, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45421501994132996, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1610.0, + "completions/max_terminated_length": 1610.0, + "completions/mean_length": 951.9375, + "completions/mean_terminated_length": 951.9375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.036698813689510966, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.18653293359410897, + "kl": 0.07275390625, + "learning_rate": 7.303754266211604e-06, + "loss": 0.0039, + "num_tokens": 48953730.0, + "reward": 1.17578125, + "reward_std": 0.16197282075881958, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38467901945114136, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1867.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 856.90234375, + "completions/mean_terminated_length": 856.90234375, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.036869505846206366, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.14031120108762524, + "kl": 0.0830078125, + "learning_rate": 7.33788395904437e-06, + "loss": 0.0088, + "num_tokens": 49218057.0, + "reward": 1.140625, + "reward_std": 0.1112399771809578, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1429.0, + "completions/max_terminated_length": 1429.0, + "completions/mean_length": 726.09765625, + "completions/mean_terminated_length": 726.09765625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.037040198002901766, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.26180734524614363, + "kl": 0.0933837890625, + "learning_rate": 7.3720136518771335e-06, + "loss": 0.0016, + "num_tokens": 49443042.0, + "reward": 1.23046875, + "reward_std": 0.1829879879951477, + "rewards/accuracy_reward/mean": 0.23046875, + "rewards/accuracy_reward/std": 0.4219578504562378, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 884.26171875, + "completions/mean_terminated_length": 879.6981201171875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.037210890159597165, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.17416160791957022, + "kl": 0.0758056640625, + "learning_rate": 7.406143344709898e-06, + "loss": 0.0009, + "num_tokens": 49707557.0, + "reward": 1.109375, + "reward_std": 0.14661693572998047, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1599.0, + "completions/max_terminated_length": 1599.0, + "completions/mean_length": 879.40234375, + "completions/mean_terminated_length": 879.40234375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.037381582316292565, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.15196131205488986, + "kl": 0.0765380859375, + "learning_rate": 7.440273037542663e-06, + "loss": 0.0062, + "num_tokens": 49971244.0, + "reward": 1.162109375, + "reward_std": 0.09908005595207214, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.3710577189922333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1741.0, + "completions/max_terminated_length": 1741.0, + "completions/mean_length": 904.2890625, + "completions/mean_terminated_length": 904.2890625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.037552274472987965, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.16995286126554765, + "kl": 0.0721435546875, + "learning_rate": 7.474402730375427e-06, + "loss": -0.0004, + "num_tokens": 50244886.0, + "reward": 1.2177734375, + "reward_std": 0.2072596698999405, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41420844197273254, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1746.0, + "completions/max_terminated_length": 1746.0, + "completions/mean_length": 851.296875, + "completions/mean_terminated_length": 851.296875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.037722966629683365, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.22827675534642985, + "kl": 0.083740234375, + "learning_rate": 7.508532423208191e-06, + "loss": 0.0122, + "num_tokens": 50506306.0, + "reward": 1.234375, + "reward_std": 0.2111538052558899, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42443734407424927, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1776.0, + "completions/max_terminated_length": 1776.0, + "completions/mean_length": 820.62109375, + "completions/mean_terminated_length": 818.5843505859375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.037893658786378764, + "frac_reward_zero_std": 0.6875, + "grad_norm": 4446.166626707807, + "kl": 254.0570068359375, + "learning_rate": 7.542662116040957e-06, + "loss": 10.1855, + "num_tokens": 50755393.0, + "reward": 1.158203125, + "reward_std": 0.10983732342720032, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.09076117724180222, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1677.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 822.46484375, + "completions/mean_terminated_length": 822.46484375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.038064350943074164, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.1415646818487898, + "kl": 0.0782470703125, + "learning_rate": 7.57679180887372e-06, + "loss": -0.0112, + "num_tokens": 51004408.0, + "reward": 1.1484375, + "reward_std": 0.13258779048919678, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1841.0, + "completions/max_terminated_length": 1841.0, + "completions/mean_length": 861.73828125, + "completions/mean_terminated_length": 861.7843627929688, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.038235043099769564, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.20002174793560948, + "kl": 0.0826416015625, + "learning_rate": 7.610921501706485e-06, + "loss": -0.0059, + "num_tokens": 51267109.0, + "reward": 1.20703125, + "reward_std": 0.14711952209472656, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40597182512283325, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 909.33984375, + "completions/mean_terminated_length": 904.8745727539062, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.038405735256464964, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1548646350489163, + "kl": 0.0748291015625, + "learning_rate": 7.64505119453925e-06, + "loss": 0.0111, + "num_tokens": 51544732.0, + "reward": 1.1513671875, + "reward_std": 0.06717796623706818, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1570.0, + "completions/max_terminated_length": 1570.0, + "completions/mean_length": 670.765625, + "completions/mean_terminated_length": 670.765625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.038576427413160363, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.07974956262383634, + "kl": 0.10302734375, + "learning_rate": 7.679180887372013e-06, + "loss": 0.0049, + "num_tokens": 51751472.0, + "reward": 1.16796875, + "reward_std": 0.029919598251581192, + "rewards/accuracy_reward/mean": 0.17916665971279144, + "rewards/accuracy_reward/std": 0.38429322838783264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1704.0, + "completions/max_terminated_length": 1704.0, + "completions/mean_length": 812.6328125, + "completions/mean_terminated_length": 807.0236206054688, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.03874711956985576, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5446911051511829, + "kl": 0.12841796875, + "learning_rate": 7.713310580204779e-06, + "loss": 0.0022, + "num_tokens": 51995250.0, + "reward": 1.166015625, + "reward_std": 0.14247754216194153, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.03125, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1949.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 834.01953125, + "completions/mean_terminated_length": 834.01953125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.03891781172655116, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.09173210530918777, + "kl": 0.087646484375, + "learning_rate": 7.747440273037543e-06, + "loss": 0.01, + "num_tokens": 52257527.0, + "reward": 1.09765625, + "reward_std": 0.057870447635650635, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1816.0, + "completions/max_terminated_length": 1816.0, + "completions/mean_length": 925.48828125, + "completions/mean_terminated_length": 925.48828125, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.03908850388324656, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.11785529009320302, + "kl": 0.08056640625, + "learning_rate": 7.781569965870308e-06, + "loss": 0.0092, + "num_tokens": 52527300.0, + "reward": 1.23828125, + "reward_std": 0.11628375202417374, + "rewards/accuracy_reward/mean": 0.23828125, + "rewards/accuracy_reward/std": 0.4268665909767151, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1668.0, + "completions/mean_length": 829.4921875, + "completions/mean_terminated_length": 823.9960327148438, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.03925919603994196, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.20579888566317592, + "kl": 0.1004638671875, + "learning_rate": 7.815699658703072e-06, + "loss": 0.018, + "num_tokens": 52780642.0, + "reward": 1.2890625, + "reward_std": 0.1818462461233139, + "rewards/accuracy_reward/mean": 0.29296875, + "rewards/accuracy_reward/std": 0.45601576566696167, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 956.01953125, + "completions/mean_terminated_length": 943.0711669921875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.03942988819663736, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.16651743292201357, + "kl": 0.0810546875, + "learning_rate": 7.849829351535837e-06, + "loss": 0.0371, + "num_tokens": 53062919.0, + "reward": 1.1279296875, + "reward_std": 0.1360161155462265, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.034663453698158264, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1963.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 812.55078125, + "completions/mean_terminated_length": 812.55078125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.03960058035333276, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16233610643660612, + "kl": 0.09765625, + "learning_rate": 7.883959044368601e-06, + "loss": 0.0023, + "num_tokens": 53309220.0, + "reward": 1.1279296875, + "reward_std": 0.09465862810611725, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 721.4375, + "completions/mean_terminated_length": 719.3176879882812, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.03977127251002816, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.4592027745927313, + "kl": 0.1259765625, + "learning_rate": 7.918088737201367e-06, + "loss": -0.0038, + "num_tokens": 53534228.0, + "reward": 1.369140625, + "reward_std": 0.1977352350950241, + "rewards/accuracy_reward/mean": 0.37109375, + "rewards/accuracy_reward/std": 0.48404383659362793, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 831.921875, + "completions/mean_terminated_length": 827.1530151367188, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.03994196466672356, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1606714260528801, + "kl": 0.087646484375, + "learning_rate": 7.95221843003413e-06, + "loss": 0.0133, + "num_tokens": 53785776.0, + "reward": 1.0771484375, + "reward_std": 0.1271505355834961, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.056234680116176605, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1681.0, + "completions/max_terminated_length": 1681.0, + "completions/mean_length": 828.61328125, + "completions/mean_terminated_length": 828.61328125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.04011265682341896, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24542192447420588, + "kl": 0.0924072265625, + "learning_rate": 7.986348122866894e-06, + "loss": 0.0139, + "num_tokens": 54043869.0, + "reward": 1.1123046875, + "reward_std": 0.17621229588985443, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.041130900382995605, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1544.0, + "completions/max_terminated_length": 1544.0, + "completions/mean_length": 725.609375, + "completions/mean_terminated_length": 725.609375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.04028334898011436, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24241984569142438, + "kl": 0.097412109375, + "learning_rate": 8.02047781569966e-06, + "loss": 0.0029, + "num_tokens": 54268729.0, + "reward": 1.2216796875, + "reward_std": 0.194053053855896, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.41942715644836426, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.06436405330896378, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1747.0, + "completions/mean_length": 992.90625, + "completions/mean_terminated_length": 988.7686767578125, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.04045404113680976, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.17686080828331163, + "kl": 0.080322265625, + "learning_rate": 8.054607508532423e-06, + "loss": 0.0232, + "num_tokens": 54570001.0, + "reward": 1.1328125, + "reward_std": 0.12323209643363953, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 830.3828125, + "completions/mean_terminated_length": 815.9447021484375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.04062473329350516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.24098990650866575, + "kl": 0.0926513671875, + "learning_rate": 8.088737201365189e-06, + "loss": 0.0399, + "num_tokens": 54826659.0, + "reward": 1.2451171875, + "reward_std": 0.23878328502178192, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.4382871091365814, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9873046875, + "rewards/tag_count_reward/std": 0.10884852707386017, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1789.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 830.2578125, + "completions/mean_terminated_length": 828.7333984375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.04079542545020056, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.38932885705581183, + "kl": 0.08984375, + "learning_rate": 8.122866894197953e-06, + "loss": 0.0297, + "num_tokens": 55076533.0, + "reward": 1.21484375, + "reward_std": 0.14274312555789948, + "rewards/accuracy_reward/mean": 0.21484375, + "rewards/accuracy_reward/std": 0.4115184545516968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1884.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 747.3203125, + "completions/mean_terminated_length": 747.3203125, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.04096611760689596, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.16297831089900555, + "kl": 0.093505859375, + "learning_rate": 8.156996587030718e-06, + "loss": 0.0092, + "num_tokens": 55306871.0, + "reward": 1.2421875, + "reward_std": 0.13776493072509766, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4292463958263397, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1781.0, + "completions/max_terminated_length": 1781.0, + "completions/mean_length": 821.921875, + "completions/mean_terminated_length": 821.921875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.04113680976359136, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.13305616574478754, + "kl": 0.08056640625, + "learning_rate": 8.191126279863482e-06, + "loss": 0.0078, + "num_tokens": 55554515.0, + "reward": 1.08984375, + "reward_std": 0.08957062661647797, + "rewards/accuracy_reward/mean": 0.09583333134651184, + "rewards/accuracy_reward/std": 0.29497772455215454, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1548.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 797.14453125, + "completions/mean_terminated_length": 797.14453125, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.04130750192028676, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1595689778332555, + "kl": 0.0865478515625, + "learning_rate": 8.225255972696247e-06, + "loss": -0.0017, + "num_tokens": 55803832.0, + "reward": 1.13671875, + "reward_std": 0.07779236882925034, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1606.0, + "completions/max_terminated_length": 1606.0, + "completions/mean_length": 785.58203125, + "completions/mean_terminated_length": 785.58203125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.04147819407698216, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20995336382173066, + "kl": 0.083740234375, + "learning_rate": 8.259385665529011e-06, + "loss": 0.0351, + "num_tokens": 56064909.0, + "reward": 1.16015625, + "reward_std": 0.1446211189031601, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.12426253408193588, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1928.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 952.08984375, + "completions/mean_terminated_length": 952.08984375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.04164888623367756, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.18847760883724018, + "kl": 0.06591796875, + "learning_rate": 8.293515358361775e-06, + "loss": 0.0087, + "num_tokens": 56346820.0, + "reward": 1.2421875, + "reward_std": 0.21933846175670624, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4292463958263397, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1648.0, + "completions/max_terminated_length": 1648.0, + "completions/mean_length": 688.13671875, + "completions/mean_terminated_length": 688.13671875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.04181957839037296, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.21177056948073306, + "kl": 0.082275390625, + "learning_rate": 8.327645051194539e-06, + "loss": 0.0086, + "num_tokens": 56565767.0, + "reward": 1.220703125, + "reward_std": 0.12078488618135452, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.41942715644836426, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 805.4140625, + "completions/mean_terminated_length": 800.5411987304688, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.04199027054706836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.22798610332680666, + "kl": 0.0849609375, + "learning_rate": 8.361774744027304e-06, + "loss": 0.0245, + "num_tokens": 56812657.0, + "reward": 1.1904296875, + "reward_std": 0.24051691591739655, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.39721766114234924, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.06436405330896378, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1495.0, + "completions/max_terminated_length": 1495.0, + "completions/mean_length": 733.65625, + "completions/mean_terminated_length": 733.65625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.04216096270376376, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.16163466957156405, + "kl": 0.085693359375, + "learning_rate": 8.395904436860068e-06, + "loss": 0.0043, + "num_tokens": 57039961.0, + "reward": 1.19140625, + "reward_std": 0.11996941268444061, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.39417871832847595, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1854.0, + "completions/max_terminated_length": 1854.0, + "completions/mean_length": 713.53125, + "completions/mean_terminated_length": 713.53125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.04233165486045916, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24973042082120603, + "kl": 0.080322265625, + "learning_rate": 8.430034129692833e-06, + "loss": 0.0063, + "num_tokens": 57261521.0, + "reward": 1.1865234375, + "reward_std": 0.1671188771724701, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1769.0, + "completions/mean_length": 758.3359375, + "completions/mean_terminated_length": 753.2785034179688, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.04250234701715456, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.21293441817042302, + "kl": 0.0828857421875, + "learning_rate": 8.464163822525599e-06, + "loss": 0.0218, + "num_tokens": 57502279.0, + "reward": 1.125, + "reward_std": 0.16707327961921692, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1716.0, + "completions/max_terminated_length": 1716.0, + "completions/mean_length": 784.78515625, + "completions/mean_terminated_length": 784.78515625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.04267303917384996, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.29960205268139223, + "kl": 0.09130859375, + "learning_rate": 8.498293515358363e-06, + "loss": -0.0054, + "num_tokens": 57748992.0, + "reward": 1.1630859375, + "reward_std": 0.18157394230365753, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.3710577189922333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2006.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 812.265625, + "completions/mean_terminated_length": 812.265625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.042843731330545365, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.23799321551980823, + "kl": 0.086669921875, + "learning_rate": 8.532423208191128e-06, + "loss": 0.0176, + "num_tokens": 57997204.0, + "reward": 1.251953125, + "reward_std": 0.21896213293075562, + "rewards/accuracy_reward/mean": 0.25390625, + "rewards/accuracy_reward/std": 0.4360972046852112, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1580.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 767.0234375, + "completions/mean_terminated_length": 767.0234375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.043014423487240765, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.16990864789965415, + "kl": 0.0931396484375, + "learning_rate": 8.566552901023892e-06, + "loss": 0.0078, + "num_tokens": 58239482.0, + "reward": 1.056640625, + "reward_std": 0.06568294763565063, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23532284796237946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.03125, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1914.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 735.078125, + "completions/mean_terminated_length": 735.078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.043185115643936164, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.16372519403727895, + "kl": 0.097412109375, + "learning_rate": 8.600682593856656e-06, + "loss": 0.0144, + "num_tokens": 58464494.0, + "reward": 1.21875, + "reward_std": 0.10981409251689911, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41420844197273254, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1658.0, + "completions/max_terminated_length": 1658.0, + "completions/mean_length": 719.25, + "completions/mean_terminated_length": 719.25, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.043355807800631564, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19314704232886748, + "kl": 0.0980224609375, + "learning_rate": 8.63481228668942e-06, + "loss": 0.0103, + "num_tokens": 58688094.0, + "reward": 1.06640625, + "reward_std": 0.12247256934642792, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1938.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 833.50390625, + "completions/mean_terminated_length": 833.50390625, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.043526499957326964, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.13137480410195804, + "kl": 0.0836181640625, + "learning_rate": 8.668941979522185e-06, + "loss": 0.0065, + "num_tokens": 58942559.0, + "reward": 1.0810546875, + "reward_std": 0.07851861417293549, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1864.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 833.890625, + "completions/mean_terminated_length": 833.890625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.043697192114022364, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.15838969555033286, + "kl": 0.0875244140625, + "learning_rate": 8.703071672354949e-06, + "loss": 0.0046, + "num_tokens": 59201779.0, + "reward": 1.171875, + "reward_std": 0.09444223344326019, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1957.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 849.3046875, + "completions/mean_terminated_length": 849.3046875, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.043867884270717764, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1823246714082369, + "kl": 0.084716796875, + "learning_rate": 8.737201365187714e-06, + "loss": 0.0309, + "num_tokens": 59463201.0, + "reward": 1.1875, + "reward_std": 0.15614622831344604, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1762.0, + "completions/mean_length": 810.72265625, + "completions/mean_terminated_length": 805.87060546875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.04403857642741316, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1904322380962351, + "kl": 0.0888671875, + "learning_rate": 8.771331058020478e-06, + "loss": -0.0041, + "num_tokens": 59713146.0, + "reward": 1.1845703125, + "reward_std": 0.16161635518074036, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.3910769522190094, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1665.0, + "completions/max_terminated_length": 1665.0, + "completions/mean_length": 881.2109375, + "completions/mean_terminated_length": 881.2109375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.04420926858410856, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.11566379685030664, + "kl": 0.0810546875, + "learning_rate": 8.805460750853243e-06, + "loss": 0.0007, + "num_tokens": 59982928.0, + "reward": 1.15234375, + "reward_std": 0.09186092019081116, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 852.359375, + "completions/mean_terminated_length": 847.670654296875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.04437996074080396, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.17152723095881042, + "kl": 0.0758056640625, + "learning_rate": 8.839590443686009e-06, + "loss": 0.0197, + "num_tokens": 60242396.0, + "reward": 1.2705078125, + "reward_std": 0.22794148325920105, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.446596622467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1822.0, + "completions/max_terminated_length": 1822.0, + "completions/mean_length": 841.9765625, + "completions/mean_terminated_length": 841.9765625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.04455065289749936, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.22676759852645098, + "kl": 0.0911865234375, + "learning_rate": 8.873720136518773e-06, + "loss": -0.0005, + "num_tokens": 60497078.0, + "reward": 1.0576171875, + "reward_std": 0.111006960272789, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23532284796237946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 919.34375, + "completions/mean_terminated_length": 914.917724609375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.04472134505419476, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.13387973332334108, + "kl": 0.0797119140625, + "learning_rate": 8.907849829351536e-06, + "loss": 0.0141, + "num_tokens": 60778270.0, + "reward": 1.2265625, + "reward_std": 0.12356472760438919, + "rewards/accuracy_reward/mean": 0.23046875, + "rewards/accuracy_reward/std": 0.4219578504562378, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 889.9921875, + "completions/mean_terminated_length": 885.4510498046875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.04489203721089016, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12951965535533952, + "kl": 0.0826416015625, + "learning_rate": 8.9419795221843e-06, + "loss": 0.0134, + "num_tokens": 61051036.0, + "reward": 1.1728515625, + "reward_std": 0.07994156330823898, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 947.67578125, + "completions/mean_terminated_length": 925.7570190429688, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.04506272936758556, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.1888446697939172, + "kl": 0.0767822265625, + "learning_rate": 8.976109215017066e-06, + "loss": 0.0312, + "num_tokens": 61342089.0, + "reward": 1.1953125, + "reward_std": 0.17893069982528687, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4087733030319214, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.11180340498685837, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 904.43359375, + "completions/mean_terminated_length": 899.9490966796875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.04523342152428096, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12094573884299921, + "kl": 0.0853271484375, + "learning_rate": 9.01023890784983e-06, + "loss": 0.0079, + "num_tokens": 61611176.0, + "reward": 1.0166015625, + "reward_std": 0.07003937661647797, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.13865381479263306, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1889.0, + "completions/max_terminated_length": 1889.0, + "completions/mean_length": 848.10546875, + "completions/mean_terminated_length": 848.10546875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.04540411368097636, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2715331369284669, + "kl": 0.095458984375, + "learning_rate": 9.044368600682595e-06, + "loss": 0.0007, + "num_tokens": 61870787.0, + "reward": 1.16796875, + "reward_std": 0.1848640739917755, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 868.515625, + "completions/mean_terminated_length": 859.2283325195312, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.04557480583767176, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.1735899748727724, + "kl": 0.086669921875, + "learning_rate": 9.078498293515359e-06, + "loss": 0.0039, + "num_tokens": 62133047.0, + "reward": 1.130859375, + "reward_std": 0.08637659251689911, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1930.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 814.44921875, + "completions/mean_terminated_length": 814.44921875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.04574549799436716, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.20118688000112814, + "kl": 0.087646484375, + "learning_rate": 9.112627986348124e-06, + "loss": 0.0018, + "num_tokens": 62381258.0, + "reward": 1.08984375, + "reward_std": 0.18242931365966797, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 762.89453125, + "completions/mean_terminated_length": 762.89453125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.04591619015106256, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.1989029228867231, + "kl": 0.0916748046875, + "learning_rate": 9.146757679180888e-06, + "loss": 0.0207, + "num_tokens": 62616703.0, + "reward": 1.265625, + "reward_std": 0.17396602034568787, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.4425306022167206, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1354.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 736.98828125, + "completions/mean_terminated_length": 736.98828125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.04608688230775796, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24182482928445356, + "kl": 0.0966796875, + "learning_rate": 9.180887372013653e-06, + "loss": 0.0245, + "num_tokens": 62846044.0, + "reward": 1.28515625, + "reward_std": 0.19585952162742615, + "rewards/accuracy_reward/mean": 0.28515625, + "rewards/accuracy_reward/std": 0.4523732364177704, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1933.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 736.43359375, + "completions/mean_terminated_length": 734.0823974609375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.04625757446445336, + "frac_reward_zero_std": 0.5625, + "grad_norm": 14993280.846798413, + "kl": 395264.0754394531, + "learning_rate": 9.215017064846417e-06, + "loss": 15845.9883, + "num_tokens": 63074235.0, + "reward": 1.27734375, + "reward_std": 0.18630313873291016, + "rewards/accuracy_reward/mean": 0.2958333194255829, + "rewards/accuracy_reward/std": 0.4573703110218048, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1674.0, + "completions/mean_length": 819.53125, + "completions/mean_terminated_length": 814.7137451171875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.04642826662114876, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20311923569749282, + "kl": 0.0950927734375, + "learning_rate": 9.249146757679181e-06, + "loss": 0.0051, + "num_tokens": 63328611.0, + "reward": 1.0537109375, + "reward_std": 0.1200973242521286, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1954.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 774.18359375, + "completions/mean_terminated_length": 774.18359375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.04659895877784416, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2335867995479536, + "kl": 0.1036376953125, + "learning_rate": 9.283276450511946e-06, + "loss": 0.0059, + "num_tokens": 63569954.0, + "reward": 1.11328125, + "reward_std": 0.1581149846315384, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1799.0, + "completions/mean_length": 881.0859375, + "completions/mean_terminated_length": 864.2704467773438, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.04676965093453956, + "frac_reward_zero_std": 0.375, + "grad_norm": 117.9304375371266, + "kl": 6.13671875, + "learning_rate": 9.31740614334471e-06, + "loss": 0.2774, + "num_tokens": 63869512.0, + "reward": 1.1806640625, + "reward_std": 0.19755572080612183, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9580078125, + "rewards/tag_count_reward/std": 0.19159889221191406, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1643.0, + "completions/max_terminated_length": 1643.0, + "completions/mean_length": 765.9453125, + "completions/mean_terminated_length": 765.9453125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.04694034309123496, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0614614698883888, + "kl": 0.0986328125, + "learning_rate": 9.351535836177476e-06, + "loss": 0.0111, + "num_tokens": 64108298.0, + "reward": 1.14453125, + "reward_std": 0.046542368829250336, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35231640934944153, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 850.4765625, + "completions/mean_terminated_length": 841.0472412109375, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.04711103524793036, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.17496598267636146, + "kl": 0.0869140625, + "learning_rate": 9.38566552901024e-06, + "loss": 0.0191, + "num_tokens": 64360308.0, + "reward": 1.1123046875, + "reward_std": 0.12416820228099823, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.06436405330896378, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 967.39453125, + "completions/mean_terminated_length": 937.0160522460938, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.04728172740462576, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1607968448946482, + "kl": 0.08251953125, + "learning_rate": 9.419795221843005e-06, + "loss": 0.0282, + "num_tokens": 64649337.0, + "reward": 1.2646484375, + "reward_std": 0.17844277620315552, + "rewards/accuracy_reward/mean": 0.28515625, + "rewards/accuracy_reward/std": 0.4523732364177704, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9794921875, + "rewards/tag_count_reward/std": 0.12841163575649261, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1595.0, + "completions/max_terminated_length": 1595.0, + "completions/mean_length": 741.0859375, + "completions/mean_terminated_length": 741.0859375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.04745241956132116, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.14362726258526867, + "kl": 0.109619140625, + "learning_rate": 9.453924914675769e-06, + "loss": 0.011, + "num_tokens": 64884719.0, + "reward": 1.078125, + "reward_std": 0.04081955552101135, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1978.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 866.33203125, + "completions/mean_terminated_length": 866.33203125, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.04762311171801656, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.13581567841103678, + "kl": 0.0919189453125, + "learning_rate": 9.488054607508534e-06, + "loss": 0.0085, + "num_tokens": 65145428.0, + "reward": 1.0654296875, + "reward_std": 0.050448618829250336, + "rewards/accuracy_reward/mean": 0.07083333283662796, + "rewards/accuracy_reward/std": 0.25708237290382385, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 878.97265625, + "completions/mean_terminated_length": 821.4794921875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.04779380387471196, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.19417826558460016, + "kl": 0.083251953125, + "learning_rate": 9.522184300341298e-06, + "loss": 0.0103, + "num_tokens": 65408557.0, + "reward": 1.1982421875, + "reward_std": 0.193007230758667, + "rewards/accuracy_reward/mean": 0.24609375, + "rewards/accuracy_reward/std": 0.43157756328582764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.2086494266986847, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 936.9453125, + "completions/mean_terminated_length": 928.1968383789062, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.04796449603140736, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.18464207639430033, + "kl": 0.091552734375, + "learning_rate": 9.556313993174062e-06, + "loss": 0.0188, + "num_tokens": 65690543.0, + "reward": 1.103515625, + "reward_std": 0.13797663152217865, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.09342263638973236, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1792.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 829.359375, + "completions/mean_terminated_length": 829.359375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.04813518818810276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.21043063458585687, + "kl": 0.093017578125, + "learning_rate": 9.590443686006825e-06, + "loss": 0.002, + "num_tokens": 65943867.0, + "reward": 1.3232421875, + "reward_std": 0.20996606349945068, + "rewards/accuracy_reward/mean": 0.32421875, + "rewards/accuracy_reward/std": 0.46899911761283875, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 922.35546875, + "completions/mean_terminated_length": 913.4921264648438, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.04830588034479816, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.20632988484028808, + "kl": 0.0887451171875, + "learning_rate": 9.62457337883959e-06, + "loss": -0.0136, + "num_tokens": 66222038.0, + "reward": 1.16796875, + "reward_std": 0.2071908563375473, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 906.6171875, + "completions/mean_terminated_length": 897.6299438476562, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.04847657250149356, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.19760382931559656, + "kl": 0.0941162109375, + "learning_rate": 9.658703071672356e-06, + "loss": 0.033, + "num_tokens": 66500980.0, + "reward": 1.1318359375, + "reward_std": 0.20845861732959747, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.08950243145227432, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1926.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 847.828125, + "completions/mean_terminated_length": 847.828125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.04864726465818896, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.19825585762640746, + "kl": 0.09912109375, + "learning_rate": 9.69283276450512e-06, + "loss": 0.0081, + "num_tokens": 66755496.0, + "reward": 1.171875, + "reward_std": 0.18417394161224365, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1993.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 942.80078125, + "completions/mean_terminated_length": 942.80078125, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.04881795681488436, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.252236503126728, + "kl": 0.10205078125, + "learning_rate": 9.726962457337886e-06, + "loss": 0.0098, + "num_tokens": 67038213.0, + "reward": 1.1474609375, + "reward_std": 0.17115169763565063, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1807.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 824.71484375, + "completions/mean_terminated_length": 824.71484375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.04898864897157976, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08681731164972682, + "kl": 0.0992431640625, + "learning_rate": 9.76109215017065e-06, + "loss": 0.0197, + "num_tokens": 67292140.0, + "reward": 1.0546875, + "reward_std": 0.021347813308238983, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1653.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 797.140625, + "completions/mean_terminated_length": 797.140625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.04915934112827516, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.23419671824078514, + "kl": 0.107666015625, + "learning_rate": 9.795221843003415e-06, + "loss": 0.0037, + "num_tokens": 67539680.0, + "reward": 1.2646484375, + "reward_std": 0.18893572688102722, + "rewards/accuracy_reward/mean": 0.26953125, + "rewards/accuracy_reward/std": 0.44458550214767456, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.06436405330896378, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 674.859375, + "completions/mean_terminated_length": 664.0472412109375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.049330033284970556, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.27622658700348013, + "kl": 0.13134765625, + "learning_rate": 9.829351535836179e-06, + "loss": 0.0599, + "num_tokens": 67752428.0, + "reward": 1.189453125, + "reward_std": 0.19595295190811157, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40311288833618164, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.09806257486343384, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 683.04296875, + "completions/mean_terminated_length": 683.04296875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.049500725441665956, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.24507873599629706, + "kl": 0.1119384765625, + "learning_rate": 9.863481228668942e-06, + "loss": 0.0012, + "num_tokens": 67963239.0, + "reward": 1.310546875, + "reward_std": 0.20142200589179993, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4644203782081604, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1548.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 824.0625, + "completions/mean_terminated_length": 824.0625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.049671417598361356, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.20371607144516127, + "kl": 0.099609375, + "learning_rate": 9.897610921501706e-06, + "loss": 0.0097, + "num_tokens": 68216119.0, + "reward": 1.0693359375, + "reward_std": 0.17139844596385956, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 740.8359375, + "completions/mean_terminated_length": 740.8359375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.049842109755056756, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.1834183384051943, + "kl": 0.0986328125, + "learning_rate": 9.931740614334472e-06, + "loss": 0.0154, + "num_tokens": 68443661.0, + "reward": 1.33203125, + "reward_std": 0.24527794122695923, + "rewards/accuracy_reward/mean": 0.33203125, + "rewards/accuracy_reward/std": 0.4718646705150604, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2042.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 812.25390625, + "completions/mean_terminated_length": 812.25390625, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.050012801911752156, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.17014260621892374, + "kl": 0.1031494140625, + "learning_rate": 9.965870307167235e-06, + "loss": 0.0044, + "num_tokens": 68694862.0, + "reward": 1.1328125, + "reward_std": 0.15448403358459473, + "rewards/accuracy_reward/mean": 0.1458333283662796, + "rewards/accuracy_reward/std": 0.3536766469478607, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.044107433408498764, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1789.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 779.66015625, + "completions/mean_terminated_length": 779.66015625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.050183494068447555, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.21024802720787317, + "kl": 0.1033935546875, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 68935943.0, + "reward": 1.2587890625, + "reward_std": 0.11779460310935974, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.4425306022167206, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.05603000521659851, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1761.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 898.77734375, + "completions/mean_terminated_length": 898.77734375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.050354186225142955, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.20843727559113884, + "kl": 0.0948486328125, + "learning_rate": 1.0034129692832766e-05, + "loss": 0.0185, + "num_tokens": 69207854.0, + "reward": 1.1708984375, + "reward_std": 0.21967318654060364, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.387910932302475, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9873046875, + "rewards/tag_count_reward/std": 0.09693823754787445, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1970.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 852.47265625, + "completions/mean_terminated_length": 852.47265625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.050524878381838355, + "frac_reward_zero_std": 0.3125, + "grad_norm": 20.840764205350837, + "kl": 1.142578125, + "learning_rate": 1.006825938566553e-05, + "loss": 0.0418, + "num_tokens": 69468647.0, + "reward": 1.201171875, + "reward_std": 0.21925190091133118, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4087733030319214, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.10053091496229172, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1850.0, + "completions/max_terminated_length": 1850.0, + "completions/mean_length": 882.0859375, + "completions/mean_terminated_length": 878.2902221679688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.050695570538533755, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.1957427398192055, + "kl": 0.110595703125, + "learning_rate": 1.0102389078498294e-05, + "loss": 0.077, + "num_tokens": 69736173.0, + "reward": 1.1611328125, + "reward_std": 0.2765665650367737, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.08203125, + "rewards/format_reward/std": 0.2749498784542084, + "rewards/tag_count_reward/mean": 0.9736328125, + "rewards/tag_count_reward/std": 0.10164377838373184, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1664.0, + "completions/max_terminated_length": 1664.0, + "completions/mean_length": 572.81640625, + "completions/mean_terminated_length": 572.81640625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.050866262695229154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42353727663575735, + "kl": 0.1474609375, + "learning_rate": 1.013651877133106e-05, + "loss": 0.1032, + "num_tokens": 69924814.0, + "reward": 1.298828125, + "reward_std": 0.6053951978683472, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.3828125, + "rewards/format_reward/std": 0.48702529072761536, + "rewards/tag_count_reward/mean": 0.849609375, + "rewards/tag_count_reward/std": 0.22760441899299622, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1165.0, + "completions/max_terminated_length": 1165.0, + "completions/mean_length": 296.18359375, + "completions/mean_terminated_length": 291.30316162109375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.051036954851924554, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.435847854839319, + "kl": 0.75927734375, + "learning_rate": 1.0170648464163823e-05, + "loss": 0.2279, + "num_tokens": 70038061.0, + "reward": 1.58203125, + "reward_std": 0.5972310304641724, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.66015625, + "rewards/format_reward/std": 0.47458380460739136, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.2322101891040802, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1371.0, + "completions/mean_length": 221.85546875, + "completions/mean_terminated_length": 207.47637939453125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.051207647008619954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1010191805634546, + "kl": 0.39697265625, + "learning_rate": 1.0204778156996589e-05, + "loss": 0.3994, + "num_tokens": 70138248.0, + "reward": 1.7197265625, + "reward_std": 0.5055744647979736, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17433346807956696, + "rewards/format_reward/mean": 0.8046875, + "rewards/format_reward/std": 0.39721766114234924, + "rewards/tag_count_reward/mean": 0.8837890625, + "rewards/tag_count_reward/std": 0.24338483810424805, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 146.7109375, + "completions/mean_terminated_length": 124.166015625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.051378339165315354, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7084363163460032, + "kl": 0.630859375, + "learning_rate": 1.0238907849829352e-05, + "loss": 0.1637, + "num_tokens": 70218174.0, + "reward": 1.9072265625, + "reward_std": 0.301019549369812, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.13865381479263306, + "rewards/format_reward/mean": 0.91015625, + "rewards/format_reward/std": 0.2865179479122162, + "rewards/tag_count_reward/mean": 0.9775390625, + "rewards/tag_count_reward/std": 0.09768597036600113, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1282.0, + "completions/mean_length": 580.0625, + "completions/mean_terminated_length": 469.04205322265625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.051549031322010753, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.800513571415775, + "kl": 0.271484375, + "learning_rate": 1.0273037542662116e-05, + "loss": 0.2056, + "num_tokens": 70411950.0, + "reward": 1.8515625, + "reward_std": 0.48258018493652344, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.8359375, + "rewards/format_reward/std": 0.3710577189922333, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.12856438755989075, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1763.0, + "completions/mean_length": 981.3125, + "completions/mean_terminated_length": 735.1538696289062, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.05171972347870615, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7316713197593117, + "kl": 0.31640625, + "learning_rate": 1.0307167235494882e-05, + "loss": 0.2865, + "num_tokens": 70710430.0, + "reward": 1.7412109375, + "reward_std": 0.663567841053009, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.7109375, + "rewards/format_reward/std": 0.45421501994132996, + "rewards/tag_count_reward/mean": 0.9013671875, + "rewards/tag_count_reward/std": 0.20361460745334625, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1693.0, + "completions/mean_length": 760.796875, + "completions/mean_terminated_length": 719.274169921875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.05189041563540155, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.5200488200455134, + "kl": 0.19091796875, + "learning_rate": 1.0341296928327647e-05, + "loss": 0.0692, + "num_tokens": 70947898.0, + "reward": 2.0, + "reward_std": 0.3080005347728729, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.92578125, + "rewards/format_reward/std": 0.2626400291919708, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.0794435366988182, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1471.0, + "completions/mean_length": 710.5703125, + "completions/mean_terminated_length": 700.0393676757812, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.05206110779209695, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.35604436019586827, + "kl": 0.1669921875, + "learning_rate": 1.0375426621160409e-05, + "loss": 0.0153, + "num_tokens": 71168124.0, + "reward": 2.1083984375, + "reward_std": 0.3562299609184265, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38467901945114136, + "rewards/format_reward/mean": 0.94921875, + "rewards/format_reward/std": 0.21998079121112823, + "rewards/tag_count_reward/mean": 0.9794921875, + "rewards/tag_count_reward/std": 0.08755578845739365, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1535.0, + "completions/mean_length": 792.953125, + "completions/mean_terminated_length": 788.0314331054688, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.05223179994879235, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.23232669063403488, + "kl": 0.146484375, + "learning_rate": 1.0409556313993175e-05, + "loss": 0.0161, + "num_tokens": 71412992.0, + "reward": 2.005859375, + "reward_std": 0.22411835193634033, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.06110577657818794, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1635.0, + "completions/max_terminated_length": 1635.0, + "completions/mean_length": 728.5703125, + "completions/mean_terminated_length": 728.5703125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.05240249210548775, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.21335504897985444, + "kl": 0.165771484375, + "learning_rate": 1.044368600682594e-05, + "loss": 0.0062, + "num_tokens": 71638290.0, + "reward": 2.0400390625, + "reward_std": 0.10184811055660248, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 727.03515625, + "completions/mean_terminated_length": 727.03515625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.05257318426218315, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.23304387762758452, + "kl": 0.13671875, + "learning_rate": 1.0477815699658704e-05, + "loss": 0.006, + "num_tokens": 71865419.0, + "reward": 2.0947265625, + "reward_std": 0.13253937661647797, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 772.484375, + "completions/mean_terminated_length": 772.484375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.05274387641887855, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.2169702979159819, + "kl": 0.142578125, + "learning_rate": 1.051194539249147e-05, + "loss": 0.006, + "num_tokens": 72103303.0, + "reward": 2.205078125, + "reward_std": 0.23298227787017822, + "rewards/accuracy_reward/mean": 0.2291666716337204, + "rewards/accuracy_reward/std": 0.4211750328540802, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1421.0, + "completions/max_terminated_length": 1421.0, + "completions/mean_length": 696.9453125, + "completions/mean_terminated_length": 696.9453125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.05291456857557395, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.18897281163729765, + "kl": 0.145263671875, + "learning_rate": 1.0546075085324231e-05, + "loss": 0.007, + "num_tokens": 72314553.0, + "reward": 2.0537109375, + "reward_std": 0.12351249903440475, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.034663453698158264, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2026.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 672.265625, + "completions/mean_terminated_length": 672.265625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.05308526073226935, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.20023392672320586, + "kl": 0.162353515625, + "learning_rate": 1.0580204778156997e-05, + "loss": 0.0129, + "num_tokens": 72524269.0, + "reward": 2.1572265625, + "reward_std": 0.1459827572107315, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.03488371521234512, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1567.0, + "completions/mean_length": 858.88671875, + "completions/mean_terminated_length": 854.2235717773438, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.05325595288896475, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.23542087875097267, + "kl": 0.1231689453125, + "learning_rate": 1.0614334470989762e-05, + "loss": 0.0117, + "num_tokens": 72785152.0, + "reward": 2.078125, + "reward_std": 0.25535058975219727, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.06213126704096794, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1883.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 745.03125, + "completions/mean_terminated_length": 745.03125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.05342664504566015, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2082714287377958, + "kl": 0.133544921875, + "learning_rate": 1.0648464163822528e-05, + "loss": 0.0245, + "num_tokens": 73010936.0, + "reward": 2.1572265625, + "reward_std": 0.12319592386484146, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.03488371521234512, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1568.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 627.30859375, + "completions/mean_terminated_length": 627.30859375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.05359733720235555, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2924583299820625, + "kl": 0.160888671875, + "learning_rate": 1.068259385665529e-05, + "loss": 0.0106, + "num_tokens": 73211943.0, + "reward": 2.2158203125, + "reward_std": 0.20406606793403625, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 734.1875, + "completions/mean_terminated_length": 734.1875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.05376802935905095, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2047312756198345, + "kl": 0.127197265625, + "learning_rate": 1.0716723549488055e-05, + "loss": 0.0033, + "num_tokens": 73445351.0, + "reward": 2.07421875, + "reward_std": 0.17629005014896393, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.06213126704096794, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1649.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 650.4296875, + "completions/mean_terminated_length": 650.4296875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.05393872151574635, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.20491026576113947, + "kl": 0.13720703125, + "learning_rate": 1.075085324232082e-05, + "loss": 0.0165, + "num_tokens": 73652501.0, + "reward": 2.119140625, + "reward_std": 0.17299388349056244, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1210.0, + "completions/max_terminated_length": 1210.0, + "completions/mean_length": 645.75, + "completions/mean_terminated_length": 643.5882568359375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.05410941367244175, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4147180311524535, + "kl": 0.247314453125, + "learning_rate": 1.0784982935153585e-05, + "loss": 0.0066, + "num_tokens": 73865781.0, + "reward": 2.1611328125, + "reward_std": 0.18043917417526245, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1393.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 704.13671875, + "completions/mean_terminated_length": 702.494140625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.05428010582913715, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.6496055985589988, + "kl": 0.147705078125, + "learning_rate": 1.081911262798635e-05, + "loss": 0.0075, + "num_tokens": 74083928.0, + "reward": 2.0087890625, + "reward_std": 0.12131985276937485, + "rewards/accuracy_reward/mean": 0.02500000037252903, + "rewards/accuracy_reward/std": 0.1564512401819229, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1157.0, + "completions/max_terminated_length": 1157.0, + "completions/mean_length": 639.17578125, + "completions/mean_terminated_length": 639.17578125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.05445079798583255, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.195471099700143, + "kl": 0.12646484375, + "learning_rate": 1.0853242320819112e-05, + "loss": 0.0098, + "num_tokens": 74284837.0, + "reward": 2.1279296875, + "reward_std": 0.1350952386856079, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1527.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 722.66796875, + "completions/mean_terminated_length": 722.66796875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.05462149014252795, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.22957552521722302, + "kl": 0.1224365234375, + "learning_rate": 1.0887372013651878e-05, + "loss": 0.0099, + "num_tokens": 74515104.0, + "reward": 2.0927734375, + "reward_std": 0.22965767979621887, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.046133846044540405, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1833.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 696.0703125, + "completions/mean_terminated_length": 696.0703125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.05479218229922335, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.243948312781072, + "kl": 0.11572265625, + "learning_rate": 1.0921501706484643e-05, + "loss": 0.0132, + "num_tokens": 74738562.0, + "reward": 2.240234375, + "reward_std": 0.22372952103614807, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4338609278202057, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1433.0, + "completions/max_terminated_length": 1433.0, + "completions/mean_length": 673.00390625, + "completions/mean_terminated_length": 673.00390625, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.05496287445591875, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.17356171347406568, + "kl": 0.12255859375, + "learning_rate": 1.0955631399317409e-05, + "loss": 0.0104, + "num_tokens": 74952995.0, + "reward": 2.1044921875, + "reward_std": 0.10905244946479797, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1509.0, + "completions/max_terminated_length": 1509.0, + "completions/mean_length": 668.3203125, + "completions/mean_terminated_length": 668.3203125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.05513356661261415, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.13711013450242804, + "kl": 0.1192626953125, + "learning_rate": 1.098976109215017e-05, + "loss": 0.0113, + "num_tokens": 75165509.0, + "reward": 2.0068359375, + "reward_std": 0.057062797248363495, + "rewards/accuracy_reward/mean": 0.01171875, + "rewards/accuracy_reward/std": 0.1078278198838234, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1100.0, + "completions/max_terminated_length": 1100.0, + "completions/mean_length": 619.2734375, + "completions/mean_terminated_length": 619.2734375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.05530425876930955, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.16718175301427948, + "kl": 0.0980224609375, + "learning_rate": 1.1023890784982936e-05, + "loss": 0.0184, + "num_tokens": 75364923.0, + "reward": 2.154296875, + "reward_std": 0.1399211585521698, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.36746934056282043, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.03125, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1427.0, + "completions/max_terminated_length": 1427.0, + "completions/mean_length": 708.77734375, + "completions/mean_terminated_length": 708.77734375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.05547495092600495, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.17654906796879138, + "kl": 0.111572265625, + "learning_rate": 1.1058020477815702e-05, + "loss": 0.0074, + "num_tokens": 75591698.0, + "reward": 2.16796875, + "reward_std": 0.10881631076335907, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1598.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 626.62109375, + "completions/mean_terminated_length": 626.62109375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.05564564308270035, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.1756237328049023, + "kl": 0.1256103515625, + "learning_rate": 1.1092150170648465e-05, + "loss": 0.0101, + "num_tokens": 75788945.0, + "reward": 2.0625, + "reward_std": 0.13747048377990723, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24253563582897186, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1607.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 816.35546875, + "completions/mean_terminated_length": 816.35546875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.05581633523939575, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.16880298871944605, + "kl": 0.1148681640625, + "learning_rate": 1.1126279863481229e-05, + "loss": 0.0041, + "num_tokens": 76039260.0, + "reward": 2.140625, + "reward_std": 0.10981409251689911, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 648.29296875, + "completions/mean_terminated_length": 648.29296875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.05598702739609115, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.15274787631408915, + "kl": 0.1221923828125, + "learning_rate": 1.1160409556313993e-05, + "loss": 0.0085, + "num_tokens": 76247463.0, + "reward": 2.0849609375, + "reward_std": 0.10580272227525711, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1776.0, + "completions/max_terminated_length": 1776.0, + "completions/mean_length": 787.16015625, + "completions/mean_terminated_length": 787.16015625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.05615771955278655, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.10942087700127755, + "kl": 0.1102294921875, + "learning_rate": 1.1194539249146758e-05, + "loss": 0.0136, + "num_tokens": 76492208.0, + "reward": 2.1015625, + "reward_std": 0.08384781330823898, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1780.0, + "completions/max_terminated_length": 1780.0, + "completions/mean_length": 752.2265625, + "completions/mean_terminated_length": 752.2265625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.05632841170948195, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.11500522491544173, + "kl": 0.10595703125, + "learning_rate": 1.1228668941979524e-05, + "loss": 0.0064, + "num_tokens": 76723098.0, + "reward": 2.103515625, + "reward_std": 0.09016838669776917, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1798.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 739.3203125, + "completions/mean_terminated_length": 739.3203125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.05649910386617735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19744850886643286, + "kl": 0.106201171875, + "learning_rate": 1.126279863481229e-05, + "loss": 0.0067, + "num_tokens": 76955820.0, + "reward": 2.0859375, + "reward_std": 0.15404412150382996, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28082075715065, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 668.97265625, + "completions/mean_terminated_length": 662.3543090820312, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.05666979602287275, + "frac_reward_zero_std": 0.5, + "grad_norm": 19.90577039043971, + "kl": 2.0299072265625, + "learning_rate": 1.1296928327645051e-05, + "loss": 0.1076, + "num_tokens": 77166325.0, + "reward": 2.1796875, + "reward_std": 0.21123388409614563, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.39417871832847595, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1720.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 737.703125, + "completions/mean_terminated_length": 737.703125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.05684048817956815, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.1779388593968028, + "kl": 0.1016845703125, + "learning_rate": 1.1331058020477817e-05, + "loss": 0.0144, + "num_tokens": 77393865.0, + "reward": 2.09765625, + "reward_std": 0.1469750702381134, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 735.9296875, + "completions/mean_terminated_length": 735.9296875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.057011180336263546, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.21329687273716652, + "kl": 0.11474609375, + "learning_rate": 1.1365187713310582e-05, + "loss": 0.0058, + "num_tokens": 77619815.0, + "reward": 2.1845703125, + "reward_std": 0.1817634552717209, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1708.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 721.8359375, + "completions/mean_terminated_length": 721.8359375, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.057181872492958946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2507428044192771, + "kl": 0.123046875, + "learning_rate": 1.1399317406143346e-05, + "loss": 0.0115, + "num_tokens": 77844957.0, + "reward": 2.0654296875, + "reward_std": 0.161052405834198, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1645.0, + "completions/max_terminated_length": 1645.0, + "completions/mean_length": 798.0703125, + "completions/mean_terminated_length": 798.0703125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.057352564649654346, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.16529084824549267, + "kl": 0.1070556640625, + "learning_rate": 1.143344709897611e-05, + "loss": 0.0115, + "num_tokens": 78101983.0, + "reward": 2.0703125, + "reward_std": 0.08118700981140137, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1285.0, + "completions/max_terminated_length": 1285.0, + "completions/mean_length": 573.77734375, + "completions/mean_terminated_length": 573.77734375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.057523256806349746, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.22481998551357082, + "kl": 0.1260986328125, + "learning_rate": 1.1467576791808874e-05, + "loss": 0.0098, + "num_tokens": 78286406.0, + "reward": 2.17578125, + "reward_std": 0.12082535773515701, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 625.00390625, + "completions/mean_terminated_length": 625.00390625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.057693948963045145, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.24267561445232197, + "kl": 0.11865234375, + "learning_rate": 1.1501706484641639e-05, + "loss": 0.0082, + "num_tokens": 78487863.0, + "reward": 2.17578125, + "reward_std": 0.1893649697303772, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1566.0, + "completions/max_terminated_length": 1566.0, + "completions/mean_length": 696.45703125, + "completions/mean_terminated_length": 696.45703125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.057864641119740545, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.19311905821435082, + "kl": 0.1201171875, + "learning_rate": 1.1535836177474405e-05, + "loss": 0.0086, + "num_tokens": 78708300.0, + "reward": 2.0771484375, + "reward_std": 0.12993110716342926, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1619.0, + "completions/max_terminated_length": 1619.0, + "completions/mean_length": 635.37109375, + "completions/mean_terminated_length": 635.37109375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.058035333276435945, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.1958544006297419, + "kl": 0.137451171875, + "learning_rate": 1.1569965870307167e-05, + "loss": 0.0067, + "num_tokens": 78913323.0, + "reward": 2.0703125, + "reward_std": 0.0687704086303711, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1331.0, + "completions/max_terminated_length": 1331.0, + "completions/mean_length": 637.4765625, + "completions/mean_terminated_length": 634.7568969726562, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.058206025433131345, + "frac_reward_zero_std": 0.4375, + "grad_norm": 33.520259817391185, + "kl": 3.639404296875, + "learning_rate": 1.1604095563139932e-05, + "loss": 0.1605, + "num_tokens": 79119285.0, + "reward": 2.259765625, + "reward_std": 0.22561872005462646, + "rewards/accuracy_reward/mean": 0.26953125, + "rewards/accuracy_reward/std": 0.44458550214767456, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 662.04296875, + "completions/mean_terminated_length": 662.04296875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.058376717589826745, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.19753770249986558, + "kl": 0.1221923828125, + "learning_rate": 1.1638225255972698e-05, + "loss": 0.0017, + "num_tokens": 79329136.0, + "reward": 2.095703125, + "reward_std": 0.10233421623706818, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1672.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 614.11328125, + "completions/mean_terminated_length": 614.11328125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.058547409746522144, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.22937706232075436, + "kl": 0.142822265625, + "learning_rate": 1.1672354948805463e-05, + "loss": 0.004, + "num_tokens": 79530301.0, + "reward": 2.09765625, + "reward_std": 0.13016413152217865, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1086.0, + "completions/max_terminated_length": 1086.0, + "completions/mean_length": 647.1015625, + "completions/mean_terminated_length": 647.1015625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.058718101903217544, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2270219137079811, + "kl": 0.1146240234375, + "learning_rate": 1.1706484641638227e-05, + "loss": -0.0071, + "num_tokens": 79743271.0, + "reward": 2.1318359375, + "reward_std": 0.14422571659088135, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1655.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 738.55859375, + "completions/mean_terminated_length": 738.55859375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.058888794059912944, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.21353443145245057, + "kl": 0.11767578125, + "learning_rate": 1.174061433447099e-05, + "loss": 0.0033, + "num_tokens": 79974774.0, + "reward": 2.130859375, + "reward_std": 0.16317118704319, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1328.0, + "completions/max_terminated_length": 1328.0, + "completions/mean_length": 709.28515625, + "completions/mean_terminated_length": 709.28515625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.059059486216608344, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.20837148858438875, + "kl": 0.108642578125, + "learning_rate": 1.1774744027303754e-05, + "loss": 0.0094, + "num_tokens": 80195263.0, + "reward": 2.1318359375, + "reward_std": 0.1869971603155136, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 724.0390625, + "completions/mean_terminated_length": 724.0390625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.05923017837330374, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2293351555438639, + "kl": 0.1026611328125, + "learning_rate": 1.180887372013652e-05, + "loss": 0.0192, + "num_tokens": 80422153.0, + "reward": 2.22265625, + "reward_std": 0.20468443632125854, + "rewards/accuracy_reward/mean": 0.23749999701976776, + "rewards/accuracy_reward/std": 0.4264404773712158, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 779.62109375, + "completions/mean_terminated_length": 769.6338500976562, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.05940087052999914, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.16521774400104278, + "kl": 0.1051025390625, + "learning_rate": 1.1843003412969285e-05, + "loss": 0.0282, + "num_tokens": 80665848.0, + "reward": 2.1103515625, + "reward_std": 0.2026018351316452, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.05603000521659851, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 703.9765625, + "completions/mean_terminated_length": 703.9765625, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.05957156268669454, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.15191124643659346, + "kl": 0.105712890625, + "learning_rate": 1.1877133105802047e-05, + "loss": 0.0077, + "num_tokens": 80885138.0, + "reward": 2.16015625, + "reward_std": 0.1363661289215088, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.36746934056282043, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1466.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 763.38671875, + "completions/mean_terminated_length": 763.38671875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.05974225484338994, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19504155713205373, + "kl": 0.111083984375, + "learning_rate": 1.1911262798634813e-05, + "loss": 0.0028, + "num_tokens": 81124869.0, + "reward": 2.0576171875, + "reward_std": 0.13714540004730225, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24253563582897186, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1480.0, + "completions/max_terminated_length": 1480.0, + "completions/mean_length": 705.75390625, + "completions/mean_terminated_length": 705.75390625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.05991294700008535, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.16248888281241722, + "kl": 0.11376953125, + "learning_rate": 1.1945392491467578e-05, + "loss": 0.0127, + "num_tokens": 81351622.0, + "reward": 2.130859375, + "reward_std": 0.15660324692726135, + "rewards/accuracy_reward/mean": 0.15416666865348816, + "rewards/accuracy_reward/std": 0.36186307668685913, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1636.0, + "completions/max_terminated_length": 1636.0, + "completions/mean_length": 756.76953125, + "completions/mean_terminated_length": 753.3215942382812, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.06008363915678075, + "frac_reward_zero_std": 0.5, + "grad_norm": 63.36751100176335, + "kl": 7.190185546875, + "learning_rate": 1.1979522184300342e-05, + "loss": 0.302, + "num_tokens": 81584763.0, + "reward": 2.0927734375, + "reward_std": 0.1775396466255188, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1828.0, + "completions/max_terminated_length": 1828.0, + "completions/mean_length": 694.484375, + "completions/mean_terminated_length": 694.484375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.06025433131347615, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1715363114750822, + "kl": 0.1129150390625, + "learning_rate": 1.2013651877133108e-05, + "loss": 0.0124, + "num_tokens": 81803239.0, + "reward": 2.13671875, + "reward_std": 0.1236695945262909, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1959.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 825.5390625, + "completions/mean_terminated_length": 825.5390625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.06042502347017155, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.2005941291143558, + "kl": 0.0980224609375, + "learning_rate": 1.2047781569965871e-05, + "loss": 0.0228, + "num_tokens": 82056993.0, + "reward": 2.2119140625, + "reward_std": 0.23343440890312195, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.41942715644836426, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1608.0, + "completions/max_terminated_length": 1608.0, + "completions/mean_length": 742.27734375, + "completions/mean_terminated_length": 742.27734375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.06059571562686695, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19545399298975433, + "kl": 0.1126708984375, + "learning_rate": 1.2081911262798635e-05, + "loss": -0.0152, + "num_tokens": 82290632.0, + "reward": 2.0322265625, + "reward_std": 0.12109375, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21178513765335083, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1672.0, + "completions/max_terminated_length": 1672.0, + "completions/mean_length": 714.90625, + "completions/mean_terminated_length": 714.90625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.06076640778356235, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.2736920887836328, + "kl": 0.127685546875, + "learning_rate": 1.21160409556314e-05, + "loss": 0.0187, + "num_tokens": 82514800.0, + "reward": 2.2021484375, + "reward_std": 0.2869417667388916, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.41942715644836426, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.034663453698158264, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 747.25, + "completions/mean_terminated_length": 747.25, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.06093709994025775, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.09614544375718699, + "kl": 0.11962890625, + "learning_rate": 1.2150170648464166e-05, + "loss": 0.0025, + "num_tokens": 82748208.0, + "reward": 2.013671875, + "reward_std": 0.05037166178226471, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1373.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 653.31640625, + "completions/mean_terminated_length": 653.31640625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.06110779209695315, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.15788420871996786, + "kl": 0.111083984375, + "learning_rate": 1.2184300341296928e-05, + "loss": 0.0059, + "num_tokens": 82953089.0, + "reward": 2.1669921875, + "reward_std": 0.12080064415931702, + "rewards/accuracy_reward/mean": 0.18333333730697632, + "rewards/accuracy_reward/std": 0.38774824142456055, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1470.0, + "completions/max_terminated_length": 1470.0, + "completions/mean_length": 719.3515625, + "completions/mean_terminated_length": 719.3515625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.06127848425364855, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.15538609674111742, + "kl": 0.113037109375, + "learning_rate": 1.2218430034129694e-05, + "loss": 0.0107, + "num_tokens": 83184619.0, + "reward": 2.10546875, + "reward_std": 0.1005660742521286, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1736.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 740.4296875, + "completions/mean_terminated_length": 740.4296875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.06144917641034395, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.1375479944349881, + "kl": 0.1280517578125, + "learning_rate": 1.2252559726962459e-05, + "loss": 0.0164, + "num_tokens": 83421513.0, + "reward": 2.08203125, + "reward_std": 0.08461952954530716, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1307.0, + "completions/max_terminated_length": 1307.0, + "completions/mean_length": 670.921875, + "completions/mean_terminated_length": 670.921875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.06161986856703935, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.19420720252757195, + "kl": 0.13720703125, + "learning_rate": 1.2286689419795223e-05, + "loss": 0.0104, + "num_tokens": 83638885.0, + "reward": 2.095703125, + "reward_std": 0.11840523034334183, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 555.4140625, + "completions/mean_terminated_length": 553.3568725585938, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.06179056072373475, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.1167889722863835, + "kl": 0.14892578125, + "learning_rate": 1.2320819112627987e-05, + "loss": 0.0235, + "num_tokens": 83822735.0, + "reward": 2.2412109375, + "reward_std": 0.07055911421775818, + "rewards/accuracy_reward/mean": 0.24609375, + "rewards/accuracy_reward/std": 0.43157756328582764, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 696.94921875, + "completions/mean_terminated_length": 696.94921875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.06196125288043015, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.15402238243555127, + "kl": 0.130615234375, + "learning_rate": 1.2354948805460752e-05, + "loss": 0.0165, + "num_tokens": 84042146.0, + "reward": 2.0810546875, + "reward_std": 0.05078125, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1252.0, + "completions/max_terminated_length": 1252.0, + "completions/mean_length": 641.13671875, + "completions/mean_terminated_length": 641.13671875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.06213194503712555, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2138505270278251, + "kl": 0.1273193359375, + "learning_rate": 1.2389078498293516e-05, + "loss": 0.0065, + "num_tokens": 84249061.0, + "reward": 2.091796875, + "reward_std": 0.15438641607761383, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 581.00390625, + "completions/mean_terminated_length": 581.00390625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.062302637193820946, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.10960814991530857, + "kl": 0.14208984375, + "learning_rate": 1.2423208191126281e-05, + "loss": 0.0064, + "num_tokens": 84440950.0, + "reward": 2.00390625, + "reward_std": 0.015625, + "rewards/accuracy_reward/mean": 0.00390625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1298.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 643.78125, + "completions/mean_terminated_length": 643.78125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.062473329350516346, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.21239338222675855, + "kl": 0.135009765625, + "learning_rate": 1.2457337883959047e-05, + "loss": 0.0145, + "num_tokens": 84644590.0, + "reward": 2.20703125, + "reward_std": 0.18258976936340332, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40597182512283325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1538.0, + "completions/max_terminated_length": 1538.0, + "completions/mean_length": 670.61328125, + "completions/mean_terminated_length": 669.61572265625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.06264402150721174, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.1024177318764035, + "kl": 0.1170654296875, + "learning_rate": 1.2491467576791809e-05, + "loss": 0.0117, + "num_tokens": 84858795.0, + "reward": 2.0908203125, + "reward_std": 0.13560330867767334, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 607.48046875, + "completions/mean_terminated_length": 607.48046875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.06281471366390715, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.18311472460025488, + "kl": 0.12451171875, + "learning_rate": 1.2525597269624574e-05, + "loss": 0.0098, + "num_tokens": 85053398.0, + "reward": 2.21875, + "reward_std": 0.1544942855834961, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41420844197273254, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1237.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 596.8515625, + "completions/mean_terminated_length": 595.0823974609375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.06298540582060254, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3927040417355703, + "kl": 0.1343994140625, + "learning_rate": 1.255972696245734e-05, + "loss": 0.0242, + "num_tokens": 85243024.0, + "reward": 2.1767578125, + "reward_std": 0.15045467019081116, + "rewards/accuracy_reward/mean": 0.20416666567325592, + "rewards/accuracy_reward/std": 0.4039337933063507, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1685.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 629.0546875, + "completions/mean_terminated_length": 628.7804565429688, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.06315609797729795, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.837953820301801, + "kl": 0.15869140625, + "learning_rate": 1.2593856655290104e-05, + "loss": 0.0015, + "num_tokens": 85454542.0, + "reward": 2.1220703125, + "reward_std": 0.08566437661647797, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1284.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 661.0703125, + "completions/mean_terminated_length": 661.0703125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.06332679013399334, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.1857967432619569, + "kl": 0.1082763671875, + "learning_rate": 1.2627986348122867e-05, + "loss": 0.0126, + "num_tokens": 85663712.0, + "reward": 2.17578125, + "reward_std": 0.14979633688926697, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1823.0, + "completions/mean_length": 662.81640625, + "completions/mean_terminated_length": 657.3843383789062, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.06349748229068874, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.2075053908963298, + "kl": 0.1075439453125, + "learning_rate": 1.2662116040955633e-05, + "loss": 0.0259, + "num_tokens": 85880785.0, + "reward": 2.173828125, + "reward_std": 0.25343048572540283, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.387910932302475, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1724.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 729.40625, + "completions/mean_terminated_length": 729.40625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.06366817444738414, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.1591282834122998, + "kl": 0.0953369140625, + "learning_rate": 1.2696245733788397e-05, + "loss": -0.0045, + "num_tokens": 86106041.0, + "reward": 2.1171875, + "reward_std": 0.11179866641759872, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1413.0, + "completions/max_terminated_length": 1413.0, + "completions/mean_length": 661.61328125, + "completions/mean_terminated_length": 661.61328125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.06383886660407954, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.17358157979102448, + "kl": 0.1136474609375, + "learning_rate": 1.2730375426621162e-05, + "loss": 0.0115, + "num_tokens": 86317462.0, + "reward": 2.2216796875, + "reward_std": 0.12198646366596222, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.41942715644836426, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1388.0, + "completions/max_terminated_length": 1388.0, + "completions/mean_length": 738.21484375, + "completions/mean_terminated_length": 735.6666870117188, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.06400955876077494, + "frac_reward_zero_std": 0.625, + "grad_norm": 7.984277607399574, + "kl": 1.200927734375, + "learning_rate": 1.2764505119453924e-05, + "loss": 0.0604, + "num_tokens": 86545405.0, + "reward": 2.0703125, + "reward_std": 0.13456955552101135, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1704.0, + "completions/max_terminated_length": 1704.0, + "completions/mean_length": 832.22265625, + "completions/mean_terminated_length": 829.05908203125, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.06418025091747034, + "frac_reward_zero_std": 0.75, + "grad_norm": 279.2264693669093, + "kl": 31.25634765625, + "learning_rate": 1.279863481228669e-05, + "loss": 1.2546, + "num_tokens": 86802982.0, + "reward": 2.03515625, + "reward_std": 0.11052955687046051, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21178513765335083, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1504.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 725.796875, + "completions/mean_terminated_length": 725.796875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.06435094307416574, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.13395343718065458, + "kl": 0.1015625, + "learning_rate": 1.2832764505119455e-05, + "loss": 0.0066, + "num_tokens": 87027426.0, + "reward": 2.0625, + "reward_std": 0.0816391110420227, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 761.125, + "completions/mean_terminated_length": 750.9921264648438, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.06452163523086114, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17827799284496013, + "kl": 0.10986328125, + "learning_rate": 1.286689419795222e-05, + "loss": 0.0141, + "num_tokens": 87265794.0, + "reward": 2.119140625, + "reward_std": 0.193276047706604, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 885.26953125, + "completions/mean_terminated_length": 880.7098388671875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.06469232738755654, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.15259238522641977, + "kl": 0.1070556640625, + "learning_rate": 1.2901023890784984e-05, + "loss": 0.0269, + "num_tokens": 87528583.0, + "reward": 2.1103515625, + "reward_std": 0.14376087486743927, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1414.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 672.1171875, + "completions/mean_terminated_length": 670.674560546875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.06486301954425194, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.8240338371254247, + "kl": 0.163818359375, + "learning_rate": 1.2935153583617748e-05, + "loss": 0.0202, + "num_tokens": 87742597.0, + "reward": 2.1474609375, + "reward_std": 0.21681858599185944, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1467.0, + "completions/max_terminated_length": 1467.0, + "completions/mean_length": 712.87109375, + "completions/mean_terminated_length": 712.87109375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.06503371170094734, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.254912117538339, + "kl": 0.12939453125, + "learning_rate": 1.2969283276450513e-05, + "loss": 0.0284, + "num_tokens": 87969124.0, + "reward": 2.21875, + "reward_std": 0.2362399846315384, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41420844197273254, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2007.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 766.15625, + "completions/mean_terminated_length": 766.15625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.06520440385764274, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.1649628202445497, + "kl": 0.1123046875, + "learning_rate": 1.3003412969283277e-05, + "loss": 0.0156, + "num_tokens": 88208460.0, + "reward": 2.1513671875, + "reward_std": 0.15456697344779968, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 765.6484375, + "completions/mean_terminated_length": 765.6484375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.06537509601433814, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.17415932094369443, + "kl": 0.116943359375, + "learning_rate": 1.3037542662116043e-05, + "loss": -0.009, + "num_tokens": 88446242.0, + "reward": 2.06640625, + "reward_std": 0.11553690582513809, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 1269.0, + "completions/mean_length": 701.2890625, + "completions/mean_terminated_length": 701.2890625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.06554578817103354, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.1251331262479083, + "kl": 0.1805419921875, + "learning_rate": 1.3071672354948805e-05, + "loss": 0.0066, + "num_tokens": 88665756.0, + "reward": 2.03125, + "reward_std": 0.11014671623706818, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.0625, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 1418.0, + "completions/max_terminated_length": 1418.0, + "completions/mean_length": 714.39453125, + "completions/mean_terminated_length": 708.5397338867188, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.06571648032772893, + "frac_reward_zero_std": 0.0, + "grad_norm": 337.04464061753885, + "kl": 37.0361328125, + "learning_rate": 1.310580204778157e-05, + "loss": 1.4574, + "num_tokens": 88896257.0, + "reward": 0.1767578125, + "reward_std": 0.2855737805366516, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21178513765335083, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.1220703125, + "rewards/tag_count_reward/std": 0.23219163715839386, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1272.0, + "completions/max_terminated_length": 1272.0, + "completions/mean_length": 663.3671875, + "completions/mean_terminated_length": 663.3671875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.06588717248442434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45012663063699593, + "kl": 0.17236328125, + "learning_rate": 1.3139931740614336e-05, + "loss": -0.0065, + "num_tokens": 89107839.0, + "reward": 0.154296875, + "reward_std": 0.21818886697292328, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.044921875, + "rewards/tag_count_reward/std": 0.12693056464195251, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 1196.0, + "completions/max_terminated_length": 1196.0, + "completions/mean_length": 717.59765625, + "completions/mean_terminated_length": 712.2500610351562, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.06605786464111973, + "frac_reward_zero_std": 0.0, + "grad_norm": 35.60420480920372, + "kl": 4.212646484375, + "learning_rate": 1.3174061433447101e-05, + "loss": 0.1467, + "num_tokens": 89335160.0, + "reward": 0.5859375, + "reward_std": 0.31876587867736816, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.41796875, + "rewards/tag_count_reward/std": 0.2731612026691437, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1569.0, + "completions/max_terminated_length": 1569.0, + "completions/mean_length": 709.77734375, + "completions/mean_terminated_length": 705.783447265625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.06622855679781514, + "frac_reward_zero_std": 0.0, + "grad_norm": 587.778769113858, + "kl": 65.2509765625, + "learning_rate": 1.3208191126279865e-05, + "loss": 2.6275, + "num_tokens": 89556991.0, + "reward": 0.4970703125, + "reward_std": 0.3262626528739929, + "rewards/accuracy_reward/mean": 0.12916666269302368, + "rewards/accuracy_reward/std": 0.3360852301120758, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3759765625, + "rewards/tag_count_reward/std": 0.2465428113937378, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1032.51171875, + "completions/mean_terminated_length": 871.6878051757812, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.06639924895451053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1093.8966915372544, + "kl": 120.8984375, + "learning_rate": 1.3242320819112629e-05, + "loss": 4.9683, + "num_tokens": 89862882.0, + "reward": 0.458984375, + "reward_std": 0.2970699667930603, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.384765625, + "rewards/tag_count_reward/std": 0.25029855966567993, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1267.40625, + "completions/mean_terminated_length": 925.3483276367188, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.06656994111120594, + "frac_reward_zero_std": 0.0, + "grad_norm": 462.05008597737003, + "kl": 51.13818359375, + "learning_rate": 1.3276450511945394e-05, + "loss": 2.1045, + "num_tokens": 90224842.0, + "reward": 0.57421875, + "reward_std": 0.34381669759750366, + "rewards/accuracy_reward/mean": 0.28515625, + "rewards/accuracy_reward/std": 0.4523732364177704, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2890625, + "rewards/tag_count_reward/std": 0.2454238086938858, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 1473.75390625, + "completions/mean_terminated_length": 974.9561767578125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.06674063326790133, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.517574536093667, + "kl": 0.59130859375, + "learning_rate": 1.3310580204778158e-05, + "loss": 0.0469, + "num_tokens": 90647147.0, + "reward": 0.3037109375, + "reward_std": 0.2844422459602356, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2255859375, + "rewards/tag_count_reward/std": 0.2293204963207245, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -4.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1461.2265625, + "completions/mean_terminated_length": 1120.7530517578125, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.06691132542459674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2728883124287144, + "kl": 0.3037109375, + "learning_rate": 1.3344709897610923e-05, + "loss": 0.0413, + "num_tokens": 91057877.0, + "reward": 0.2939453125, + "reward_std": 0.3136971592903137, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.1962890625, + "rewards/tag_count_reward/std": 0.2175980806350708, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1128.08984375, + "completions/mean_terminated_length": 991.959716796875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.06708201758129213, + "frac_reward_zero_std": 0.0, + "grad_norm": 19.26180550508991, + "kl": 2.5263671875, + "learning_rate": 1.3378839590443686e-05, + "loss": 0.0729, + "num_tokens": 91383676.0, + "reward": 0.4599609375, + "reward_std": 0.354418009519577, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2607421875, + "rewards/tag_count_reward/std": 0.24179038405418396, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 958.03125, + "completions/mean_terminated_length": 918.3157958984375, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.06725270973798754, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.332690268657835, + "kl": 1.849609375, + "learning_rate": 1.3412969283276451e-05, + "loss": 0.0514, + "num_tokens": 91672740.0, + "reward": 0.2626953125, + "reward_std": 0.2818765938282013, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2236328125, + "rewards/tag_count_reward/std": 0.24160024523735046, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 860.484375, + "completions/mean_terminated_length": 836.8287353515625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.06742340189468293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3460779099656817, + "kl": 0.24609375, + "learning_rate": 1.3447098976109216e-05, + "loss": -0.0084, + "num_tokens": 91935248.0, + "reward": 0.294921875, + "reward_std": 0.3189602494239807, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.212890625, + "rewards/tag_count_reward/std": 0.24672332406044006, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1964.0, + "completions/mean_length": 749.03125, + "completions/mean_terminated_length": 733.6284790039062, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.06759409405137834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4006212220390634, + "kl": 0.279296875, + "learning_rate": 1.3481228668941982e-05, + "loss": -0.0079, + "num_tokens": 92161736.0, + "reward": 0.3896484375, + "reward_std": 0.34170597791671753, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2490234375, + "rewards/tag_count_reward/std": 0.26055940985679626, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 761.94140625, + "completions/mean_terminated_length": 728.9797973632812, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.06776478620807373, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.075936196014757, + "kl": 0.357421875, + "learning_rate": 1.3515358361774744e-05, + "loss": 0.0362, + "num_tokens": 92393193.0, + "reward": 0.4853515625, + "reward_std": 0.31586146354675293, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.3330078125, + "rewards/tag_count_reward/std": 0.24890808761119843, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 767.00390625, + "completions/mean_terminated_length": 736.5240478515625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.06793547836476914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7989696821250276, + "kl": 0.41259765625, + "learning_rate": 1.354948805460751e-05, + "loss": 0.0229, + "num_tokens": 92636090.0, + "reward": 0.52734375, + "reward_std": 0.2859032154083252, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.421875, + "rewards/tag_count_reward/std": 0.21062465012073517, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1592.0, + "completions/mean_length": 741.421875, + "completions/mean_terminated_length": 707.4055786132812, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.06810617052146453, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6683555004888269, + "kl": 0.2646484375, + "learning_rate": 1.3583617747440275e-05, + "loss": 0.0725, + "num_tokens": 92860630.0, + "reward": 0.5556640625, + "reward_std": 0.2628312110900879, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.4580078125, + "rewards/tag_count_reward/std": 0.18902313709259033, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 665.7734375, + "completions/mean_terminated_length": 654.8897705078125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.06827686267815994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3564578879921089, + "kl": 0.230224609375, + "learning_rate": 1.3617747440273039e-05, + "loss": 0.0041, + "num_tokens": 93069676.0, + "reward": 0.537109375, + "reward_std": 0.18444755673408508, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12426253408193588, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.521484375, + "rewards/tag_count_reward/std": 0.16576425731182098, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1664.0, + "completions/max_terminated_length": 1664.0, + "completions/mean_length": 643.89453125, + "completions/mean_terminated_length": 643.89453125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.06844755483485535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35281785910527913, + "kl": 0.20947265625, + "learning_rate": 1.3651877133105804e-05, + "loss": 0.0305, + "num_tokens": 93274945.0, + "reward": 0.609375, + "reward_std": 0.20120488107204437, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.56640625, + "rewards/tag_count_reward/std": 0.17982664704322815, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1546.0, + "completions/mean_length": 664.625, + "completions/mean_terminated_length": 656.5, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.06861824699155074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4890617359010364, + "kl": 0.220947265625, + "learning_rate": 1.3686006825938566e-05, + "loss": 0.0339, + "num_tokens": 93487809.0, + "reward": 0.6982421875, + "reward_std": 0.23294128477573395, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6435546875, + "rewards/tag_count_reward/std": 0.16904421150684357, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1765.0, + "completions/max_terminated_length": 1328.0, + "completions/mean_length": 694.01171875, + "completions/mean_terminated_length": 688.8740234375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.06878893914824614, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.312998789911841, + "kl": 0.240234375, + "learning_rate": 1.3720136518771332e-05, + "loss": 0.0243, + "num_tokens": 93709412.0, + "reward": 0.8876953125, + "reward_std": 0.2579317092895508, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40597182512283325, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6806640625, + "rewards/tag_count_reward/std": 0.15601392090320587, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1194.0, + "completions/max_terminated_length": 1194.0, + "completions/mean_length": 594.34375, + "completions/mean_terminated_length": 592.435302734375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.06895963130494154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.811273478753453, + "kl": 0.259521484375, + "learning_rate": 1.3754266211604097e-05, + "loss": 0.017, + "num_tokens": 93907404.0, + "reward": 0.90234375, + "reward_std": 0.243524432182312, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.39721766114234924, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.16893373429775238, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1272.0, + "completions/max_terminated_length": 1272.0, + "completions/mean_length": 666.59765625, + "completions/mean_terminated_length": 665.0078735351562, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.06913032346163694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.350934515350543, + "kl": 0.227294921875, + "learning_rate": 1.3788395904436863e-05, + "loss": 0.0192, + "num_tokens": 94121461.0, + "reward": 0.9111328125, + "reward_std": 0.28491097688674927, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.7626953125, + "rewards/tag_count_reward/std": 0.20069722831249237, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1583.0, + "completions/max_terminated_length": 1583.0, + "completions/mean_length": 666.06640625, + "completions/mean_terminated_length": 664.6627807617188, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.06930101561833234, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2876984511479743, + "kl": 0.2216796875, + "learning_rate": 1.3822525597269625e-05, + "loss": 0.0271, + "num_tokens": 94332534.0, + "reward": 1.01953125, + "reward_std": 0.22727656364440918, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.15733692049980164, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1253.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 622.265625, + "completions/mean_terminated_length": 622.265625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.06947170777502774, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.2705817237797446, + "kl": 0.201904296875, + "learning_rate": 1.385665529010239e-05, + "loss": 0.018, + "num_tokens": 94529178.0, + "reward": 1.115234375, + "reward_std": 0.11386298388242722, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.056953661143779755, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1606.0, + "completions/max_terminated_length": 1606.0, + "completions/mean_length": 592.03515625, + "completions/mean_terminated_length": 590.2510375976562, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.06964239993172314, + "frac_reward_zero_std": 0.3125, + "grad_norm": 3.0893729624329858, + "kl": 0.21484375, + "learning_rate": 1.3890784982935156e-05, + "loss": 0.0152, + "num_tokens": 94721299.0, + "reward": 1.2587890625, + "reward_std": 0.22593355178833008, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.446596622467041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9853515625, + "rewards/tag_count_reward/std": 0.0830206349492073, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1678.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 766.5859375, + "completions/mean_terminated_length": 766.5859375, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.06981309208841854, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.20481473902410924, + "kl": 0.167724609375, + "learning_rate": 1.392491467576792e-05, + "loss": 0.0173, + "num_tokens": 94956793.0, + "reward": 1.130859375, + "reward_std": 0.15346664190292358, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.05775492265820503, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1518.0, + "completions/max_terminated_length": 1518.0, + "completions/mean_length": 711.38671875, + "completions/mean_terminated_length": 711.38671875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.06998378424511394, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.16421752402844594, + "kl": 0.1865234375, + "learning_rate": 1.3959044368600683e-05, + "loss": 0.0147, + "num_tokens": 95178684.0, + "reward": 1.0166015625, + "reward_std": 0.05069809406995773, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.040850620716810226, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1520.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 756.96484375, + "completions/mean_terminated_length": 756.96484375, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.07015447640180934, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1832302995552983, + "kl": 0.195556640625, + "learning_rate": 1.3993174061433447e-05, + "loss": 0.0235, + "num_tokens": 95413363.0, + "reward": 1.1201171875, + "reward_std": 0.16657234728336334, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.05575593560934067, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 786.3515625, + "completions/mean_terminated_length": 786.3515625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.07032516855850474, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.24874208164943196, + "kl": 0.164794921875, + "learning_rate": 1.4027303754266213e-05, + "loss": 0.0119, + "num_tokens": 95664653.0, + "reward": 1.2158203125, + "reward_std": 0.26775890588760376, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42443734407424927, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9814453125, + "rewards/tag_count_reward/std": 0.07274357974529266, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1394.0, + "completions/max_terminated_length": 1394.0, + "completions/mean_length": 721.2421875, + "completions/mean_terminated_length": 721.2421875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.07049586071520014, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.1788851982418471, + "kl": 0.18017578125, + "learning_rate": 1.4061433447098978e-05, + "loss": 0.0144, + "num_tokens": 95892139.0, + "reward": 1.0458984375, + "reward_std": 0.08866801857948303, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23532284796237946, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9873046875, + "rewards/tag_count_reward/std": 0.05928463488817215, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1353.0, + "completions/max_terminated_length": 1353.0, + "completions/mean_length": 763.91796875, + "completions/mean_terminated_length": 763.91796875, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.07066655287189554, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.267957542940237, + "kl": 0.187255859375, + "learning_rate": 1.4095563139931743e-05, + "loss": -0.0063, + "num_tokens": 96125878.0, + "reward": 1.0703125, + "reward_std": 0.18923771381378174, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.10145993530750275, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1616.0, + "completions/max_terminated_length": 1616.0, + "completions/mean_length": 694.87890625, + "completions/mean_terminated_length": 694.87890625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.07083724502859094, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.2603823787749925, + "kl": 0.197021484375, + "learning_rate": 1.4129692832764506e-05, + "loss": 0.0432, + "num_tokens": 96340279.0, + "reward": 1.1728515625, + "reward_std": 0.2390124499797821, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9736328125, + "rewards/tag_count_reward/std": 0.14361608028411865, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 606.4921875, + "completions/mean_terminated_length": 606.4921875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.07100793718528634, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.2992556847615929, + "kl": 0.1962890625, + "learning_rate": 1.4163822525597271e-05, + "loss": 0.0396, + "num_tokens": 96537445.0, + "reward": 1.0595703125, + "reward_std": 0.0804891586303711, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.05603000521659851, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1326.0, + "completions/max_terminated_length": 1326.0, + "completions/mean_length": 638.3671875, + "completions/mean_terminated_length": 638.3671875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.07117862934198174, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.25374952657865996, + "kl": 0.197998046875, + "learning_rate": 1.4197952218430035e-05, + "loss": 0.0236, + "num_tokens": 96742371.0, + "reward": 1.2216796875, + "reward_std": 0.18920713663101196, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41684433817863464, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.041130900382995605, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1147.0, + "completions/max_terminated_length": 1147.0, + "completions/mean_length": 659.25, + "completions/mean_terminated_length": 657.3372802734375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.07134932149867713, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.307338810669194, + "kl": 0.520751953125, + "learning_rate": 1.42320819112628e-05, + "loss": 0.0217, + "num_tokens": 96949907.0, + "reward": 1.154296875, + "reward_std": 0.19782650470733643, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.36746934056282043, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.08226180076599121, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1255.0, + "completions/mean_length": 625.62890625, + "completions/mean_terminated_length": 608.7628784179688, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.07152001365537254, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.3565367804180744, + "kl": 0.234375, + "learning_rate": 1.4266211604095564e-05, + "loss": 0.0652, + "num_tokens": 97147876.0, + "reward": 1.0439453125, + "reward_std": 0.18972669541835785, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9736328125, + "rewards/tag_count_reward/std": 0.14361608028411865, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 569.359375, + "completions/mean_terminated_length": 569.359375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.07169070581206793, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16838266878983688, + "kl": 0.22705078125, + "learning_rate": 1.4300341296928328e-05, + "loss": 0.0204, + "num_tokens": 97330720.0, + "reward": 1.0234375, + "reward_std": 0.0832536593079567, + "rewards/accuracy_reward/mean": 0.02500000037252903, + "rewards/accuracy_reward/std": 0.1564512401819229, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 546.0234375, + "completions/mean_terminated_length": 546.0234375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.07186139796876334, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.23104261116704053, + "kl": 0.22412109375, + "learning_rate": 1.4334470989761093e-05, + "loss": 0.0138, + "num_tokens": 97512278.0, + "reward": 1.1591796875, + "reward_std": 0.18407510221004486, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 0.01171875, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1202.0, + "completions/max_terminated_length": 1202.0, + "completions/mean_length": 595.27734375, + "completions/mean_terminated_length": 595.27734375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.07203209012545873, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.41508388486372305, + "kl": 0.21875, + "learning_rate": 1.4368600682593859e-05, + "loss": 0.0079, + "num_tokens": 97705565.0, + "reward": 1.173828125, + "reward_std": 0.35069024562835693, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 0.11328125, + "rewards/format_reward/std": 0.31755712628364563, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.06185327470302582, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1301.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 582.4609375, + "completions/mean_terminated_length": 582.4609375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.07220278228215414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44384898185504357, + "kl": 0.20751953125, + "learning_rate": 1.4402730375426624e-05, + "loss": 0.0182, + "num_tokens": 97897907.0, + "reward": 1.8056640625, + "reward_std": 0.5314779281616211, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.66015625, + "rewards/format_reward/std": 0.47458380460739136, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.060245808213949203, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 588.1875, + "completions/mean_terminated_length": 588.1875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.07237347443884953, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.27982703384220087, + "kl": 0.19677734375, + "learning_rate": 1.4436860068259386e-05, + "loss": 0.0062, + "num_tokens": 98092883.0, + "reward": 2.1025390625, + "reward_std": 0.13163700699806213, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.05603000521659851, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1292.0, + "completions/max_terminated_length": 1292.0, + "completions/mean_length": 617.234375, + "completions/mean_terminated_length": 615.9176635742188, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.07254416659554494, + "frac_reward_zero_std": 0.4375, + "grad_norm": 87.62734029502326, + "kl": 5.46630859375, + "learning_rate": 1.4470989761092152e-05, + "loss": 0.231, + "num_tokens": 98300927.0, + "reward": 2.0673828125, + "reward_std": 0.1928013563156128, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9892578125, + "rewards/tag_count_reward/std": 0.0743061825633049, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 575.703125, + "completions/mean_terminated_length": 575.703125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.07271485875224033, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.35061350671097363, + "kl": 0.20458984375, + "learning_rate": 1.4505119453924915e-05, + "loss": 0.0525, + "num_tokens": 98490947.0, + "reward": 2.0185546875, + "reward_std": 0.25870639085769653, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24253563582897186, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.9833984375, + "rewards/tag_count_reward/std": 0.07321586459875107, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 474.15234375, + "completions/mean_terminated_length": 471.6235656738281, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.07288555090893574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.46947265178202946, + "kl": 0.239501953125, + "learning_rate": 1.4539249146757681e-05, + "loss": 0.0488, + "num_tokens": 98657466.0, + "reward": 2.0048828125, + "reward_std": 0.18985775113105774, + "rewards/accuracy_reward/mean": 0.04583333432674408, + "rewards/accuracy_reward/std": 0.2095605432987213, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.9853515625, + "rewards/tag_count_reward/std": 0.07022564113140106, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 485.33984375, + "completions/mean_terminated_length": 485.33984375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.07305624306563113, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.3967341618649653, + "kl": 0.232666015625, + "learning_rate": 1.4573378839590445e-05, + "loss": 0.021, + "num_tokens": 98817057.0, + "reward": 1.9580078125, + "reward_std": 0.19442911446094513, + "rewards/accuracy_reward/mean": 0.01171875, + "rewards/accuracy_reward/std": 0.1078278198838234, + "rewards/format_reward/mean": 0.95703125, + "rewards/format_reward/std": 0.20318391919136047, + "rewards/tag_count_reward/mean": 0.9892578125, + "rewards/tag_count_reward/std": 0.0554114393889904, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1279.0, + "completions/max_terminated_length": 1279.0, + "completions/mean_length": 558.9453125, + "completions/mean_terminated_length": 556.431396484375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.07322693522232654, + "frac_reward_zero_std": 0.625, + "grad_norm": 5.33395262510784, + "kl": 0.249267578125, + "learning_rate": 1.4607508532423209e-05, + "loss": 0.0298, + "num_tokens": 99002691.0, + "reward": 2.02734375, + "reward_std": 0.08853194117546082, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.05805254727602005, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1274.0, + "completions/max_terminated_length": 1274.0, + "completions/mean_length": 533.19921875, + "completions/mean_terminated_length": 533.19921875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.07339762737902193, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.27044408620677657, + "kl": 0.23681640625, + "learning_rate": 1.4641638225255974e-05, + "loss": 0.0098, + "num_tokens": 99181670.0, + "reward": 2.07421875, + "reward_std": 0.14029237627983093, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28082075715065, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.03106563352048397, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1284.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 647.4140625, + "completions/mean_terminated_length": 646.157470703125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.07356831953571734, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1375390.1786725554, + "kl": 74544.10986328125, + "learning_rate": 1.467576791808874e-05, + "loss": 2993.5454, + "num_tokens": 99385200.0, + "reward": 1.9111328125, + "reward_std": 0.32506459951400757, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.18453538417816162, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2920515835285187, + "rewards/tag_count_reward/mean": 0.9697265625, + "rewards/tag_count_reward/std": 0.10986410826444626, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1196.0, + "completions/max_terminated_length": 1196.0, + "completions/mean_length": 566.54296875, + "completions/mean_terminated_length": 566.54296875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.07373901169241273, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.8022089774599251, + "kl": 0.242431640625, + "learning_rate": 1.4709897610921502e-05, + "loss": 0.0312, + "num_tokens": 99573451.0, + "reward": 1.7392578125, + "reward_std": 0.5162203311920166, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.6640625, + "rewards/format_reward/std": 0.4732423722743988, + "rewards/tag_count_reward/mean": 0.9189453125, + "rewards/tag_count_reward/std": 0.11932186782360077, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1735.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 640.671875, + "completions/mean_terminated_length": 640.671875, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.07390970384910814, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.30567837006315557, + "kl": 0.2412109375, + "learning_rate": 1.4744027303754267e-05, + "loss": 0.0274, + "num_tokens": 99778199.0, + "reward": 2.099609375, + "reward_std": 0.20077888667583466, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.08226180076599121, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 1560.0, + "completions/max_terminated_length": 1560.0, + "completions/mean_length": 677.78515625, + "completions/mean_terminated_length": 671.1818237304688, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.07408039600580353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 17.050032506303165, + "kl": 0.291748046875, + "learning_rate": 1.4778156996587032e-05, + "loss": 0.0488, + "num_tokens": 99992880.0, + "reward": 2.037109375, + "reward_std": 0.20850998163223267, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.09025342762470245, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1163.0, + "completions/max_terminated_length": 1163.0, + "completions/mean_length": 652.2109375, + "completions/mean_terminated_length": 651.051025390625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.07425108816249894, + "frac_reward_zero_std": 0.3125, + "grad_norm": 8.786500889101764e+19, + "kl": 2.9723757540645274e+18, + "learning_rate": 1.4812286689419796e-05, + "loss": 1.180158142793646e+17, + "num_tokens": 100196870.0, + "reward": 2.0361328125, + "reward_std": 0.22184216976165771, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17433346807956696, + "rewards/tag_count_reward/mean": 0.9853515625, + "rewards/tag_count_reward/std": 0.07022564113140106, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 662.32421875, + "completions/mean_terminated_length": 660.56298828125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.07442178031919433, + "frac_reward_zero_std": 0.25, + "grad_norm": 90.49459685059662, + "kl": 6.74560546875, + "learning_rate": 1.4846416382252562e-05, + "loss": 0.2872, + "num_tokens": 100409433.0, + "reward": 2.029296875, + "reward_std": 0.22743967175483704, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.982421875, + "rewards/tag_count_reward/std": 0.09226769208908081, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1215.0, + "completions/max_terminated_length": 1215.0, + "completions/mean_length": 686.34765625, + "completions/mean_terminated_length": 683.43701171875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.07459247247588974, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.3236687202504865, + "kl": 0.3564453125, + "learning_rate": 1.4880546075085325e-05, + "loss": 0.0451, + "num_tokens": 100629794.0, + "reward": 2.0, + "reward_std": 0.2658517360687256, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.9609375, + "rewards/format_reward/std": 0.19412322342395782, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.08993461728096008, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 789.546875, + "completions/mean_terminated_length": 760.9273681640625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.07476316463258513, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.3903217998427972, + "kl": 0.42138671875, + "learning_rate": 1.491467576791809e-05, + "loss": 0.0997, + "num_tokens": 100883598.0, + "reward": 1.9755859375, + "reward_std": 0.3977769613265991, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24253563582897186, + "rewards/tag_count_reward/mean": 0.9560546875, + "rewards/tag_count_reward/std": 0.1542612463235855, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 750.22265625, + "completions/mean_terminated_length": 741.9291381835938, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.07493385678928054, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.3488499804867065, + "kl": 0.30517578125, + "learning_rate": 1.4948805460750855e-05, + "loss": 0.0581, + "num_tokens": 101119463.0, + "reward": 2.0556640625, + "reward_std": 0.1545250117778778, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9853515625, + "rewards/tag_count_reward/std": 0.0887288972735405, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 671.85546875, + "completions/mean_terminated_length": 670.7755737304688, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.07510454894597593, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.2707509241587503, + "kl": 0.31884765625, + "learning_rate": 1.498293515358362e-05, + "loss": 0.0251, + "num_tokens": 101334434.0, + "reward": 2.037109375, + "reward_std": 0.20189441740512848, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.982421875, + "rewards/tag_count_reward/std": 0.09991955012083054, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 677.7890625, + "completions/mean_terminated_length": 674.3992309570312, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.07527524110267134, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.23735066881615863, + "kl": 0.32373046875, + "learning_rate": 1.5017064846416382e-05, + "loss": 0.0255, + "num_tokens": 101547788.0, + "reward": 2.072265625, + "reward_std": 0.1826612949371338, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.07606977969408035, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 609.671875, + "completions/mean_terminated_length": 606.68505859375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.07544593325936673, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.2758005700440931, + "kl": 0.289794921875, + "learning_rate": 1.5051194539249148e-05, + "loss": 0.0264, + "num_tokens": 101742376.0, + "reward": 2.1435546875, + "reward_std": 0.2602675259113312, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.07120048254728317, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 632.9609375, + "completions/mean_terminated_length": 625.7222290039062, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.07561662541606214, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.23598552744373277, + "kl": 0.36376953125, + "learning_rate": 1.5085324232081913e-05, + "loss": 0.0366, + "num_tokens": 101945502.0, + "reward": 2.0546875, + "reward_std": 0.169838547706604, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.09319689869880676, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1206.0, + "completions/max_terminated_length": 1206.0, + "completions/mean_length": 643.8984375, + "completions/mean_terminated_length": 642.4509887695312, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.07578731757275753, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.28410391948630054, + "kl": 0.268310546875, + "learning_rate": 1.5119453924914677e-05, + "loss": 0.0246, + "num_tokens": 102153572.0, + "reward": 2.22265625, + "reward_std": 0.2309720814228058, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42443734407424927, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1448.0, + "completions/max_terminated_length": 1448.0, + "completions/mean_length": 646.9140625, + "completions/mean_terminated_length": 646.9140625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.07595800972945294, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.21923981467578674, + "kl": 0.2294921875, + "learning_rate": 1.515358361774744e-05, + "loss": 0.0127, + "num_tokens": 102360494.0, + "reward": 2.1630859375, + "reward_std": 0.17895321547985077, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.3710577189922333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1399.0, + "completions/max_terminated_length": 1399.0, + "completions/mean_length": 687.51953125, + "completions/mean_terminated_length": 685.7647705078125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.07612870188614833, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4788289097188592, + "kl": 0.2451171875, + "learning_rate": 1.5187713310580206e-05, + "loss": 0.013, + "num_tokens": 102579379.0, + "reward": 2.09375, + "reward_std": 0.13906793296337128, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 663.51953125, + "completions/mean_terminated_length": 663.51953125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.07629939404284374, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.19702512786670434, + "kl": 0.24658203125, + "learning_rate": 1.522184300341297e-05, + "loss": 0.0132, + "num_tokens": 102791384.0, + "reward": 2.1484375, + "reward_std": 0.10189647972583771, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 644.57421875, + "completions/mean_terminated_length": 642.8980712890625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.07647008619953913, + "frac_reward_zero_std": 0.6875, + "grad_norm": 127842.25998717066, + "kl": 8832.17138671875, + "learning_rate": 1.5255972696245735e-05, + "loss": 352.4972, + "num_tokens": 102996987.0, + "reward": 2.0185546875, + "reward_std": 0.12033452093601227, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.18453538417816162, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.05169277638196945, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1374.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 692.74609375, + "completions/mean_terminated_length": 691.486328125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.07664077835623453, + "frac_reward_zero_std": 0.4375, + "grad_norm": 2.1516323365729515, + "kl": 0.21728515625, + "learning_rate": 1.52901023890785e-05, + "loss": 0.0324, + "num_tokens": 103220442.0, + "reward": 2.0107421875, + "reward_std": 0.20558327436447144, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23532284796237946, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.9873046875, + "rewards/tag_count_reward/std": 0.08623361587524414, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1272.0, + "completions/max_terminated_length": 1272.0, + "completions/mean_length": 649.51171875, + "completions/mean_terminated_length": 649.51171875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.07681147051292993, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.18569194320107946, + "kl": 0.214599609375, + "learning_rate": 1.5324232081911263e-05, + "loss": -0.0005, + "num_tokens": 103427149.0, + "reward": 2.1015625, + "reward_std": 0.14853152632713318, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1411.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 683.81640625, + "completions/mean_terminated_length": 682.0980834960938, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.07698216266962533, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3534858507763152, + "kl": 0.22412109375, + "learning_rate": 1.5358361774744027e-05, + "loss": 0.0035, + "num_tokens": 103645374.0, + "reward": 2.216796875, + "reward_std": 0.18045921623706818, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.41942715644836426, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1193.0, + "completions/mean_length": 1950.13671875, + "completions/mean_terminated_length": 729.4210815429688, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.07715285482632073, + "frac_reward_zero_std": 0.0, + "grad_norm": 704778.4515854705, + "kl": 62912.0, + "learning_rate": 1.5392491467576794e-05, + "loss": 2513.9065, + "num_tokens": 104190561.0, + "reward": 0.7900390625, + "reward_std": 0.7205222845077515, + "rewards/accuracy_reward/mean": 0.1458333283662796, + "rewards/accuracy_reward/std": 0.3536766469478607, + "rewards/format_reward/mean": 0.33203125, + "rewards/format_reward/std": 0.4718646705150604, + "rewards/tag_count_reward/mean": 0.3212890625, + "rewards/tag_count_reward/std": 0.40042367577552795, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 1968.83984375, + "completions/mean_terminated_length": 781.4375, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "epoch": 0.07732354698301613, + "frac_reward_zero_std": 0.0, + "grad_norm": 44268.182930449235, + "kl": 4584.0, + "learning_rate": 1.5426621160409558e-05, + "loss": 183.0651, + "num_tokens": 104739992.0, + "reward": 0.833984375, + "reward_std": 0.7299940586090088, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.3671875, + "rewards/format_reward/std": 0.48298248648643494, + "rewards/tag_count_reward/mean": 0.361328125, + "rewards/tag_count_reward/std": 0.39208918809890747, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 2032.51171875, + "completions/mean_terminated_length": 1056.75, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "epoch": 0.07749423913971153, + "frac_reward_zero_std": 0.0, + "grad_norm": 425.749441291451, + "kl": 41.75, + "learning_rate": 1.546075085324232e-05, + "loss": 1.6757, + "num_tokens": 105299707.0, + "reward": 0.86328125, + "reward_std": 0.7753152847290039, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.37890625, + "rewards/format_reward/std": 0.4860650300979614, + "rewards/tag_count_reward/mean": 0.3671875, + "rewards/tag_count_reward/std": 0.4039480686187744, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1232.0, + "completions/mean_length": 2029.18359375, + "completions/mean_terminated_length": 843.75, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.07766493129640693, + "frac_reward_zero_std": 0.0, + "grad_norm": 34.26045767782299, + "kl": 1.197265625, + "learning_rate": 1.5494880546075085e-05, + "loss": 0.0402, + "num_tokens": 105856010.0, + "reward": 1.0791015625, + "reward_std": 0.7693315148353577, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3268752694129944, + "rewards/format_reward/mean": 0.50390625, + "rewards/format_reward/std": 0.5009641647338867, + "rewards/tag_count_reward/mean": 0.4541015625, + "rewards/tag_count_reward/std": 0.41253969073295593, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 2034.03125, + "completions/mean_terminated_length": 856.0, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "epoch": 0.07783562345310233, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3092359545873072, + "kl": 2.078125, + "learning_rate": 1.5529010238907852e-05, + "loss": 0.0773, + "num_tokens": 106416722.0, + "reward": 1.2724609375, + "reward_std": 0.7291982173919678, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49209436774253845, + "rewards/tag_count_reward/mean": 0.6279296875, + "rewards/tag_count_reward/std": 0.4034061133861542, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1209.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 929.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 766.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07800631560979773, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20869901422907058, + "kl": 3.02734375, + "learning_rate": 1.5563139931740616e-05, + "loss": 0.121, + "num_tokens": 106703218.0, + "reward": 1.802734375, + "reward_std": 0.6416635513305664, + "rewards/accuracy_reward/mean": 0.14166666567325592, + "rewards/accuracy_reward/std": 0.3494367003440857, + "rewards/format_reward/mean": 0.80078125, + "rewards/format_reward/std": 0.40019527077674866, + "rewards/tag_count_reward/mean": 0.869140625, + "rewards/tag_count_reward/std": 0.26002421975135803, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1133.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 822.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07817700776649313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15620588024506227, + "kl": 2.97265625, + "learning_rate": 1.559726962457338e-05, + "loss": 0.119, + "num_tokens": 107035042.0, + "reward": 1.86328125, + "reward_std": 0.7137435674667358, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 0.8359375, + "rewards/format_reward/std": 0.3710577189922333, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.27293679118156433, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1681.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1047.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 679.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07834769992318853, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17645303277155988, + "kl": 3.0234375, + "learning_rate": 1.5631399317406144e-05, + "loss": 0.1206, + "num_tokens": 107349106.0, + "reward": 1.7744140625, + "reward_std": 0.6813749670982361, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23532284796237946, + "rewards/format_reward/mean": 0.83203125, + "rewards/format_reward/std": 0.3745708465576172, + "rewards/tag_count_reward/mean": 0.8837890625, + "rewards/tag_count_reward/std": 0.26737815141677856, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1432.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 899.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 574.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07851839207988393, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.22885392110253924, + "kl": 2.87890625, + "learning_rate": 1.5665529010238908e-05, + "loss": 0.1152, + "num_tokens": 107623074.0, + "reward": 1.701171875, + "reward_std": 0.6433519124984741, + "rewards/accuracy_reward/mean": 0.03333333507180214, + "rewards/accuracy_reward/std": 0.17988063395023346, + "rewards/format_reward/mean": 0.80078125, + "rewards/format_reward/std": 0.40019527077674866, + "rewards/tag_count_reward/mean": 0.869140625, + "rewards/tag_count_reward/std": 0.2764708399772644, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 807.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 618.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07868908423657933, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.20369637912350358, + "kl": 3.046875, + "learning_rate": 1.5699658703071675e-05, + "loss": 0.1217, + "num_tokens": 107869874.0, + "reward": 1.669921875, + "reward_std": 0.6960712671279907, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.7734375, + "rewards/format_reward/std": 0.41942715644836426, + "rewards/tag_count_reward/mean": 0.841796875, + "rewards/tag_count_reward/std": 0.30174508690834045, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1202.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 939.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 640.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07885977639327472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17292716098243663, + "kl": 2.86328125, + "learning_rate": 1.573378839590444e-05, + "loss": 0.1145, + "num_tokens": 108152290.0, + "reward": 1.8173828125, + "reward_std": 0.6371423602104187, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3483152687549591, + "rewards/tag_count_reward/mean": 0.9033203125, + "rewards/tag_count_reward/std": 0.24887731671333313, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 955.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 640.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07903046854997013, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21003145499109221, + "kl": 2.89453125, + "learning_rate": 1.5767918088737202e-05, + "loss": 0.1158, + "num_tokens": 108442162.0, + "reward": 1.82421875, + "reward_std": 0.5868255496025085, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 0.8671875, + "rewards/format_reward/std": 0.3400367796421051, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.24554860591888428, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1116.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 684.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07920116070666552, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.15772173852701693, + "kl": 3.06640625, + "learning_rate": 1.5802047781569966e-05, + "loss": 0.1224, + "num_tokens": 108768210.0, + "reward": 1.748046875, + "reward_std": 0.6365604400634766, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.8046875, + "rewards/format_reward/std": 0.39721766114234924, + "rewards/tag_count_reward/mean": 0.861328125, + "rewards/tag_count_reward/std": 0.2883499264717102, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1705.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1047.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 675.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07937185286336093, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.1861306958063219, + "kl": 3.08203125, + "learning_rate": 1.5836177474402733e-05, + "loss": 0.1233, + "num_tokens": 109075490.0, + "reward": 1.9345703125, + "reward_std": 0.6521338224411011, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 0.83984375, + "rewards/format_reward/std": 0.36746934056282043, + "rewards/tag_count_reward/mean": 0.8955078125, + "rewards/tag_count_reward/std": 0.252573698759079, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1725.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1095.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 628.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07954254502005632, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.1673804964426501, + "kl": 2.96875, + "learning_rate": 1.5870307167235497e-05, + "loss": 0.1188, + "num_tokens": 109398178.0, + "reward": 1.77734375, + "reward_std": 0.6266515254974365, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3638034462928772, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.26150020956993103, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1505.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1070.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 803.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07971323717675173, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.1500039978511672, + "kl": 2.859375, + "learning_rate": 1.590443686006826e-05, + "loss": 0.1143, + "num_tokens": 109712626.0, + "reward": 1.880859375, + "reward_std": 0.45913994312286377, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.31272050738334656, + "rewards/tag_count_reward/mean": 0.923828125, + "rewards/tag_count_reward/std": 0.22373521327972412, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 975.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 452.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.07988392933344712, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1597144286825033, + "kl": 3.25390625, + "learning_rate": 1.5938566552901024e-05, + "loss": 0.1302, + "num_tokens": 110008370.0, + "reward": 1.7373046875, + "reward_std": 0.4416307210922241, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.83203125, + "rewards/format_reward/std": 0.3745708465576172, + "rewards/tag_count_reward/mean": 0.8818359375, + "rewards/tag_count_reward/std": 0.2701708972454071, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1188.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 686.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08005462149014253, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.13710372109489158, + "kl": 2.73046875, + "learning_rate": 1.5972696245733788e-05, + "loss": 0.1092, + "num_tokens": 110352002.0, + "reward": 2.01171875, + "reward_std": 0.3350610136985779, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.9609375, + "rewards/format_reward/std": 0.19412322342395782, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.1390950232744217, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1382.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1037.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 588.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08022531364683792, + "frac_reward_zero_std": 0.4375, + "grad_norm": 2.5760007142591492, + "kl": 2.76953125, + "learning_rate": 1.6006825938566555e-05, + "loss": 0.1107, + "num_tokens": 110656562.0, + "reward": 1.98828125, + "reward_std": 0.2697985768318176, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17433346807956696, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.13075010478496552, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1134.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 680.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08039600580353333, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.7838004034413584e+19, + "kl": 2.715670575304409e+18, + "learning_rate": 1.604095563139932e-05, + "loss": 1.0862077088786022e+17, + "num_tokens": 110985634.0, + "reward": 2.03515625, + "reward_std": 0.25893932580947876, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.1269456446170807, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 937.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 614.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08056669796022872, + "frac_reward_zero_std": 0.5, + "grad_norm": 244367223382.1364, + "kl": 16190013440.0, + "learning_rate": 1.6075085324232083e-05, + "loss": 646387328.0, + "num_tokens": 111267042.0, + "reward": 2.115234375, + "reward_std": 0.2022751420736313, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1467.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1091.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 846.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08073739011692413, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.6437679608067773, + "kl": 2.9453125, + "learning_rate": 1.6109215017064847e-05, + "loss": 0.1177, + "num_tokens": 111597538.0, + "reward": 2.0703125, + "reward_std": 0.2502266764640808, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.08494158834218979, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1318.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1197.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 978.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08090808227361952, + "frac_reward_zero_std": 0.5, + "grad_norm": 732.9815226933832, + "kl": 39.0625, + "learning_rate": 1.6143344709897614e-05, + "loss": 1.5671, + "num_tokens": 111941810.0, + "reward": 2.1357421875, + "reward_std": 0.21763752400875092, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.05169277638196945, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1485.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1064.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 844.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08107877443031493, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.18356514769866444, + "kl": 2.84765625, + "learning_rate": 1.6177474402730378e-05, + "loss": 0.114, + "num_tokens": 112257714.0, + "reward": 2.078125, + "reward_std": 0.20032955706119537, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.06957504153251648, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1500.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 993.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 462.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08124946658701032, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.23508713382632085, + "kl": 2.8359375, + "learning_rate": 1.621160409556314e-05, + "loss": 0.1134, + "num_tokens": 112548418.0, + "reward": 2.0390625, + "reward_std": 0.09045085310935974, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 925.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 653.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08142015874370573, + "frac_reward_zero_std": 0.4375, + "grad_norm": 2.3041168239381884, + "kl": 2.87109375, + "learning_rate": 1.6245733788395905e-05, + "loss": 0.1149, + "num_tokens": 112826130.0, + "reward": 2.1064453125, + "reward_std": 0.24498699605464935, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1547.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 988.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 525.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08159085090040112, + "frac_reward_zero_std": 0.625, + "grad_norm": 1442.4983649885542, + "kl": 104.875, + "learning_rate": 1.627986348122867e-05, + "loss": 4.1967, + "num_tokens": 113124514.0, + "reward": 1.8134765625, + "reward_std": 0.1999577432870865, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.85546875, + "rewards/format_reward/std": 0.35231640934944153, + "rewards/tag_count_reward/mean": 0.8603515625, + "rewards/tag_count_reward/std": 0.34196704626083374, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1190.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 691.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08176154305709653, + "frac_reward_zero_std": 0.4375, + "grad_norm": 905.1603586831432, + "kl": 69.25, + "learning_rate": 1.6313993174061436e-05, + "loss": 2.7745, + "num_tokens": 113469586.0, + "reward": 2.025390625, + "reward_std": 0.20752984285354614, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.08226180076599121, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1646.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1166.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 738.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08193223521379192, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.7985352004461477, + "kl": 2.390625, + "learning_rate": 1.63481228668942e-05, + "loss": 0.0956, + "num_tokens": 113806850.0, + "reward": 2.1103515625, + "reward_std": 0.28576844930648804, + "rewards/accuracy_reward/mean": 0.1666666716337204, + "rewards/accuracy_reward/std": 0.3734568655490875, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.9814453125, + "rewards/tag_count_reward/std": 0.11460700631141663, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1449.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 864.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 472.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08210292737048733, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2420463126245498, + "kl": 2.96875, + "learning_rate": 1.6382252559726964e-05, + "loss": 0.1189, + "num_tokens": 114063090.0, + "reward": 2.138671875, + "reward_std": 0.20197871327400208, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.3710577189922333, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.08226180076599121, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1039.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 811.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08227361952718272, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.483493694921344, + "kl": 3.234375, + "learning_rate": 1.6416382252559727e-05, + "loss": 0.1293, + "num_tokens": 114373362.0, + "reward": 1.98828125, + "reward_std": 0.18505513668060303, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.1634024828672409, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.10502100735902786, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1080.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 649.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08244431168387813, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.8032292269705372, + "kl": 2.3359375, + "learning_rate": 1.6450511945392495e-05, + "loss": 0.0935, + "num_tokens": 114691394.0, + "reward": 2.169921875, + "reward_std": 0.26042434573173523, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.39721766114234924, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.08226180076599121, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1461.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1053.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 693.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08261500384057352, + "frac_reward_zero_std": 0.4375, + "grad_norm": 6.670960621562455, + "kl": 2.13671875, + "learning_rate": 1.648464163822526e-05, + "loss": 0.0853, + "num_tokens": 115003698.0, + "reward": 2.0537109375, + "reward_std": 0.25317761301994324, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.9833984375, + "rewards/tag_count_reward/std": 0.10603261739015579, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1770.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1039.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 729.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08278569599726893, + "frac_reward_zero_std": 0.5, + "grad_norm": 248.44400740387735, + "kl": 18.71875, + "learning_rate": 1.6518771331058022e-05, + "loss": 0.7484, + "num_tokens": 115308610.0, + "reward": 2.12890625, + "reward_std": 0.2551625370979309, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.09319689869880676, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1057.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 821.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08295638815396432, + "frac_reward_zero_std": 0.5625, + "grad_norm": 78.62590683232558, + "kl": 6.4609375, + "learning_rate": 1.6552901023890786e-05, + "loss": 0.2578, + "num_tokens": 115625506.0, + "reward": 2.0751953125, + "reward_std": 0.18129241466522217, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1759.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 988.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 596.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08312708031065973, + "frac_reward_zero_std": 0.4375, + "grad_norm": 7.870257421719125, + "kl": 2.125, + "learning_rate": 1.658703071672355e-05, + "loss": 0.0851, + "num_tokens": 115922594.0, + "reward": 2.111328125, + "reward_std": 0.26417064666748047, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.3710577189922333, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17433346807956696, + "rewards/tag_count_reward/mean": 0.978515625, + "rewards/tag_count_reward/std": 0.12338106334209442, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1561.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1060.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 684.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08329777246735512, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.813977707652389, + "kl": 1.9609375, + "learning_rate": 1.6621160409556317e-05, + "loss": 0.0786, + "num_tokens": 116234258.0, + "reward": 2.0625, + "reward_std": 0.23851656913757324, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.08494158834218979, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1055.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 684.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08346846462405053, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.863849028639164, + "kl": 2.296875, + "learning_rate": 1.6655290102389077e-05, + "loss": 0.0919, + "num_tokens": 116544930.0, + "reward": 1.916015625, + "reward_std": 0.11821235716342926, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.9296875, + "rewards/format_reward/std": 0.2561737895011902, + "rewards/tag_count_reward/mean": 0.931640625, + "rewards/tag_count_reward/std": 0.24993106722831726, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 956.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 789.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08363915678074592, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.347087831590834, + "kl": 1.697265625, + "learning_rate": 1.6689419795221844e-05, + "loss": 0.0679, + "num_tokens": 116828994.0, + "reward": 2.0693359375, + "reward_std": 0.2186899036169052, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9873046875, + "rewards/tag_count_reward/std": 0.09437597543001175, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1238.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 976.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 648.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08380984893744133, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.797147583983853, + "kl": 1.927734375, + "learning_rate": 1.6723549488054608e-05, + "loss": 0.0772, + "num_tokens": 117120978.0, + "reward": 1.935546875, + "reward_std": 0.19608421623706818, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.26889389753341675, + "rewards/tag_count_reward/mean": 0.943359375, + "rewards/tag_count_reward/std": 0.19732238352298737, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1527.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 941.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 598.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08398054109413672, + "frac_reward_zero_std": 0.4375, + "grad_norm": 10.157228988464652, + "kl": 1.880859375, + "learning_rate": 1.6757679180887375e-05, + "loss": 0.0753, + "num_tokens": 117404642.0, + "reward": 2.08203125, + "reward_std": 0.2047976553440094, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.06957504153251648, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1470.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 907.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 690.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08415123325083212, + "frac_reward_zero_std": 0.625, + "grad_norm": 75027.8418838825, + "kl": 7.412109375, + "learning_rate": 1.6791808873720136e-05, + "loss": 0.2962, + "num_tokens": 117680706.0, + "reward": 2.1533203125, + "reward_std": 0.16917790472507477, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3780108094215393, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1426.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 906.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 774.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08432192540752752, + "frac_reward_zero_std": 0.6875, + "grad_norm": 47.59498256921271, + "kl": 4.40625, + "learning_rate": 1.6825938566552903e-05, + "loss": 0.1763, + "num_tokens": 117956354.0, + "reward": 2.140625, + "reward_std": 0.14578913152217865, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1133.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 915.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 745.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08449261756422292, + "frac_reward_zero_std": 0.6875, + "grad_norm": 37.25496626128845, + "kl": 4.15234375, + "learning_rate": 1.6860068259385667e-05, + "loss": 0.1664, + "num_tokens": 118236210.0, + "reward": 2.0830078125, + "reward_std": 0.1197633445262909, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1518.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1114.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 700.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08466330972091832, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.82658492657792, + "kl": 1.974609375, + "learning_rate": 1.689419795221843e-05, + "loss": 0.079, + "num_tokens": 118563090.0, + "reward": 2.12109375, + "reward_std": 0.10079212486743927, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3268752694129944, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1339.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 891.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 627.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08483400187761372, + "frac_reward_zero_std": 0.375, + "grad_norm": 8.374351251544892, + "kl": 1.953125, + "learning_rate": 1.6928327645051198e-05, + "loss": 0.0781, + "num_tokens": 118834274.0, + "reward": 2.09765625, + "reward_std": 0.25487229228019714, + "rewards/accuracy_reward/mean": 0.13333334028720856, + "rewards/accuracy_reward/std": 0.34064507484436035, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.09319689869880676, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1286.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 920.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 623.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08500469403430912, + "frac_reward_zero_std": 0.6875, + "grad_norm": 4.956633647765463, + "kl": 2.18359375, + "learning_rate": 1.6962457337883958e-05, + "loss": 0.0874, + "num_tokens": 119113474.0, + "reward": 1.990234375, + "reward_std": 0.13186177611351013, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12426253408193588, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.08226180076599121, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1209.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 965.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 669.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08517538619100452, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.483060123771291, + "kl": 2.41015625, + "learning_rate": 1.6996587030716725e-05, + "loss": 0.0964, + "num_tokens": 119400210.0, + "reward": 2.0419921875, + "reward_std": 0.0874568447470665, + "rewards/accuracy_reward/mean": 0.05416666716337204, + "rewards/accuracy_reward/std": 0.22681932151317596, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1280.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 899.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 675.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08534607834769992, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.6183274413313615, + "kl": 2.05859375, + "learning_rate": 1.703071672354949e-05, + "loss": 0.0825, + "num_tokens": 119668706.0, + "reward": 2.068359375, + "reward_std": 0.13647809624671936, + "rewards/accuracy_reward/mean": 0.08749999850988388, + "rewards/accuracy_reward/std": 0.2831569015979767, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 973.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 761.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08551677050439532, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.151425228635237, + "kl": 1.638671875, + "learning_rate": 1.7064846416382256e-05, + "loss": 0.0655, + "num_tokens": 119959506.0, + "reward": 2.0380859375, + "reward_std": 0.08877892792224884, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1082.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 854.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 695.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08568746266109073, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.755150120917537, + "kl": 1.671875, + "learning_rate": 1.7098976109215017e-05, + "loss": 0.0668, + "num_tokens": 120217826.0, + "reward": 2.1064453125, + "reward_std": 0.13757246732711792, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1077.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 721.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08585815481778612, + "frac_reward_zero_std": 0.5625, + "grad_norm": 14.741168896694996, + "kl": 2.341796875, + "learning_rate": 1.7133105802047784e-05, + "loss": 0.0937, + "num_tokens": 120545394.0, + "reward": 2.013671875, + "reward_std": 0.18342772126197815, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.08226180076599121, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1244.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 826.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 608.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08602884697448153, + "frac_reward_zero_std": 0.6875, + "grad_norm": 7.609567025350744, + "kl": 1.943359375, + "learning_rate": 1.7167235494880547e-05, + "loss": 0.0779, + "num_tokens": 120797922.0, + "reward": 2.1015625, + "reward_std": 0.13912242650985718, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1805.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1085.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 794.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08619953913117692, + "frac_reward_zero_std": 0.8125, + "grad_norm": 15.610145612573781, + "kl": 2.6328125, + "learning_rate": 1.720136518771331e-05, + "loss": 0.1053, + "num_tokens": 121123666.0, + "reward": 2.08203125, + "reward_std": 0.06822281330823898, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1262.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1068.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 927.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08637023128787233, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.8930309110650596, + "kl": 2.17578125, + "learning_rate": 1.723549488054608e-05, + "loss": 0.0873, + "num_tokens": 121443938.0, + "reward": 2.0087890625, + "reward_std": 0.1257619559764862, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.1634024828672409, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 932.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 416.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08654092344456772, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7508168896569634, + "kl": 1.849609375, + "learning_rate": 1.726962457337884e-05, + "loss": 0.0741, + "num_tokens": 121720178.0, + "reward": 2.19921875, + "reward_std": 0.19523435831069946, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1776.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1100.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 838.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08671161560126313, + "frac_reward_zero_std": 0.6875, + "grad_norm": 3.27244669778837, + "kl": 2.12109375, + "learning_rate": 1.7303754266211606e-05, + "loss": 0.0849, + "num_tokens": 122039026.0, + "reward": 2.09375, + "reward_std": 0.09341736882925034, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 921.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 725.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08688230775795852, + "frac_reward_zero_std": 0.625, + "grad_norm": 2.725033351684196, + "kl": 2.58984375, + "learning_rate": 1.733788395904437e-05, + "loss": 0.1035, + "num_tokens": 122322050.0, + "reward": 2.154296875, + "reward_std": 0.19333918392658234, + "rewards/accuracy_reward/mean": 0.17916665971279144, + "rewards/accuracy_reward/std": 0.38429322838783264, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1567.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1056.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 856.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08705299991465393, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.8144192534628312, + "kl": 2.4765625, + "learning_rate": 1.7372013651877137e-05, + "loss": 0.099, + "num_tokens": 122634914.0, + "reward": 2.0703125, + "reward_std": 0.122725710272789, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1657.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1073.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 660.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08722369207134932, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.6968953115743295, + "kl": 2.4765625, + "learning_rate": 1.7406143344709897e-05, + "loss": 0.0987, + "num_tokens": 122952258.0, + "reward": 2.11328125, + "reward_std": 0.19573915004730225, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1693.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1144.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 834.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08739438422804473, + "frac_reward_zero_std": 0.6875, + "grad_norm": 2.929492481232903, + "kl": 2.31640625, + "learning_rate": 1.7440273037542664e-05, + "loss": 0.0928, + "num_tokens": 123287986.0, + "reward": 1.9814453125, + "reward_std": 0.14848542213439941, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12426253408193588, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9853515625, + "rewards/tag_count_reward/std": 0.1039903536438942, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1293.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1088.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 729.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08756507638474012, + "frac_reward_zero_std": 0.5625, + "grad_norm": 16.200248493382286, + "kl": 4.05078125, + "learning_rate": 1.7474402730375428e-05, + "loss": 0.162, + "num_tokens": 123607250.0, + "reward": 2.09765625, + "reward_std": 0.19921278953552246, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1461.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1206.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 974.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08773576854143553, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.7177334002169278, + "kl": 1.96875, + "learning_rate": 1.7508532423208192e-05, + "loss": 0.0787, + "num_tokens": 123956514.0, + "reward": 2.078125, + "reward_std": 0.17419016361236572, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2047.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1257.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 952.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08790646069813092, + "frac_reward_zero_std": 0.6875, + "grad_norm": 8097825627.339802, + "kl": 511180800.0, + "learning_rate": 1.7542662116040956e-05, + "loss": 20448722.0, + "num_tokens": 124317730.0, + "reward": 2.0703125, + "reward_std": 0.1271837055683136, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1680.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1229.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 940.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08807715285482633, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5662936887091992, + "kl": 2.005859375, + "learning_rate": 1.757679180887372e-05, + "loss": 0.0803, + "num_tokens": 124667810.0, + "reward": 1.9951171875, + "reward_std": 0.09031975269317627, + "rewards/accuracy_reward/mean": 0.01171875, + "rewards/accuracy_reward/std": 0.1078278198838234, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.05169277638196945, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1111.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 567.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08824784501152172, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.477075680170529, + "kl": 2.22265625, + "learning_rate": 1.7610921501706487e-05, + "loss": 0.089, + "num_tokens": 124988866.0, + "reward": 2.14453125, + "reward_std": 0.1972825825214386, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1487.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1104.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 923.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08841853716821713, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.8859371934447644, + "kl": 1.65234375, + "learning_rate": 1.764505119453925e-05, + "loss": 0.0661, + "num_tokens": 125314978.0, + "reward": 2.16796875, + "reward_std": 0.14383357763290405, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1627.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1157.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 950.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08858922932491252, + "frac_reward_zero_std": 0.6875, + "grad_norm": 2.402293369236075, + "kl": 2.025390625, + "learning_rate": 1.7679180887372018e-05, + "loss": 0.0811, + "num_tokens": 125658450.0, + "reward": 2.03515625, + "reward_std": 0.13855217397212982, + "rewards/accuracy_reward/mean": 0.05859375, + "rewards/accuracy_reward/std": 0.23532284796237946, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.06957504153251648, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1165.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 765.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08875992148160793, + "frac_reward_zero_std": 0.6875, + "grad_norm": 4.271214142457904, + "kl": 2.06640625, + "learning_rate": 1.7713310580204778e-05, + "loss": 0.0826, + "num_tokens": 125994562.0, + "reward": 2.0341796875, + "reward_std": 0.09842796623706818, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1214.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 748.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08893061363830332, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9470870296200733, + "kl": 1.896484375, + "learning_rate": 1.7747440273037545e-05, + "loss": 0.0758, + "num_tokens": 126345250.0, + "reward": 2.0537109375, + "reward_std": 0.20771411061286926, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.0808708667755127, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1564.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1097.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 791.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08910130579499873, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3298114865988717, + "kl": 1.689453125, + "learning_rate": 1.778156996587031e-05, + "loss": 0.0676, + "num_tokens": 126666306.0, + "reward": 2.083984375, + "reward_std": 0.14705026149749756, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0661611557006836, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1469.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 732.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08927199795169412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9458246841187299, + "kl": 1.57421875, + "learning_rate": 1.7815699658703073e-05, + "loss": 0.0631, + "num_tokens": 126964946.0, + "reward": 2.1259765625, + "reward_std": 0.22715617716312408, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1353.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 985.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 788.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08944269010838952, + "frac_reward_zero_std": 0.4375, + "grad_norm": 3.7571543747876657, + "kl": 1.623046875, + "learning_rate": 1.7849829351535836e-05, + "loss": 0.0648, + "num_tokens": 127261522.0, + "reward": 2.0966796875, + "reward_std": 0.1994314193725586, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.034663453698158264, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 973.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 645.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08961338226508492, + "frac_reward_zero_std": 0.625, + "grad_norm": 14.163174334383815, + "kl": 2.8671875, + "learning_rate": 1.78839590443686e-05, + "loss": 0.115, + "num_tokens": 127547474.0, + "reward": 2.20703125, + "reward_std": 0.14359626173973083, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40597182512283325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1310.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1060.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 750.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08978407442178032, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4970263489587989, + "kl": 1.466796875, + "learning_rate": 1.7918088737201367e-05, + "loss": 0.0588, + "num_tokens": 127861570.0, + "reward": 2.08984375, + "reward_std": 0.08713587373495102, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1116.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 784.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08995476657847572, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4449483654293698, + "kl": 1.50390625, + "learning_rate": 1.795221843003413e-05, + "loss": 0.0601, + "num_tokens": 128193506.0, + "reward": 2.0859375, + "reward_std": 0.09686321020126343, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.03106563352048397, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1216.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 899.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 662.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09012545873517112, + "frac_reward_zero_std": 0.625, + "grad_norm": 4.984377452468171, + "kl": 2.134765625, + "learning_rate": 1.7986348122866895e-05, + "loss": 0.0854, + "num_tokens": 128466642.0, + "reward": 2.12890625, + "reward_std": 0.1379641890525818, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1244.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 984.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 751.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09029615089186652, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.24153270666939988, + "kl": 1.5234375, + "learning_rate": 1.802047781569966e-05, + "loss": 0.061, + "num_tokens": 128762882.0, + "reward": 2.21875, + "reward_std": 0.13236366212368011, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41420844197273254, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 936.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 757.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09046684304856192, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7579920578948722, + "kl": 1.625, + "learning_rate": 1.8054607508532426e-05, + "loss": 0.0651, + "num_tokens": 129045682.0, + "reward": 2.046875, + "reward_std": 0.11255647987127304, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21178513765335083, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1390.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1034.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 767.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09063753520525732, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.5374870139667616, + "kl": 1.416015625, + "learning_rate": 1.808873720136519e-05, + "loss": 0.0568, + "num_tokens": 129350018.0, + "reward": 2.05859375, + "reward_std": 0.12532462179660797, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1267.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 967.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 816.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09080822736195272, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.7999509704926184, + "kl": 1.400390625, + "learning_rate": 1.8122866894197953e-05, + "loss": 0.056, + "num_tokens": 129639042.0, + "reward": 2.16796875, + "reward_std": 0.24282482266426086, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38467901945114136, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1672.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1183.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 809.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09097891951864812, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.0172303354895296, + "kl": 1.48046875, + "learning_rate": 1.8156996587030717e-05, + "loss": 0.0592, + "num_tokens": 129986194.0, + "reward": 2.1923828125, + "reward_std": 0.15778368711471558, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.40019527077674866, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1425.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1100.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 631.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09114961167534352, + "frac_reward_zero_std": 0.625, + "grad_norm": 18.17205368562457, + "kl": 3.1015625, + "learning_rate": 1.819112627986348e-05, + "loss": 0.1241, + "num_tokens": 130307554.0, + "reward": 2.21484375, + "reward_std": 0.18032719194889069, + "rewards/accuracy_reward/mean": 0.21484375, + "rewards/accuracy_reward/std": 0.4115184545516968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1009.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 814.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09132030383203892, + "frac_reward_zero_std": 0.6875, + "grad_norm": 38.79685366307669, + "kl": 6.453125, + "learning_rate": 1.8225255972696248e-05, + "loss": 0.2577, + "num_tokens": 130604498.0, + "reward": 2.0791015625, + "reward_std": 0.11119156330823898, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28082075715065, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1867.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1251.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 793.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09149099598873432, + "frac_reward_zero_std": 0.5, + "grad_norm": 41.57676184121003, + "kl": 6.0703125, + "learning_rate": 1.8259385665529012e-05, + "loss": 0.2424, + "num_tokens": 130968786.0, + "reward": 2.107421875, + "reward_std": 0.20534619688987732, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 991.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 720.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09166168814542971, + "frac_reward_zero_std": 0.6875, + "grad_norm": 4.949990376963158, + "kl": 1.865234375, + "learning_rate": 1.8293515358361776e-05, + "loss": 0.0745, + "num_tokens": 131258402.0, + "reward": 2.0751953125, + "reward_std": 0.13775739073753357, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1122.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 938.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09183238030212512, + "frac_reward_zero_std": 0.375, + "grad_norm": 3.1283112105833064, + "kl": 1.587890625, + "learning_rate": 1.832764505119454e-05, + "loss": 0.0635, + "num_tokens": 131588338.0, + "reward": 2.1953125, + "reward_std": 0.2545146942138672, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.39721766114234924, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1454.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1128.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 916.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09200307245882051, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.9890205605900125, + "kl": 1.427734375, + "learning_rate": 1.8361774744027307e-05, + "loss": 0.0571, + "num_tokens": 131916002.0, + "reward": 2.1220703125, + "reward_std": 0.20345276594161987, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 984.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 718.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09217376461551592, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.7929486880745427, + "kl": 1.623046875, + "learning_rate": 1.839590443686007e-05, + "loss": 0.065, + "num_tokens": 132206386.0, + "reward": 2.1484375, + "reward_std": 0.15733106434345245, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1208.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 993.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 700.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09234445677221131, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.4398541092437005, + "kl": 1.712890625, + "learning_rate": 1.8430034129692834e-05, + "loss": 0.0685, + "num_tokens": 132497458.0, + "reward": 2.076171875, + "reward_std": 0.11200668662786484, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28082075715065, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1465.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 997.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 623.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09251514892890672, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8519156792074762, + "kl": 1.8203125, + "learning_rate": 1.8464163822525598e-05, + "loss": 0.0728, + "num_tokens": 132795490.0, + "reward": 2.140625, + "reward_std": 0.1706668883562088, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3483152687549591, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1058.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 745.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09268584108560211, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3457995393910607, + "kl": 1.66796875, + "learning_rate": 1.8498293515358362e-05, + "loss": 0.0667, + "num_tokens": 133105138.0, + "reward": 2.1484375, + "reward_std": 0.15284234285354614, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1575.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1169.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 854.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09285653324229752, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.587865383255485, + "kl": 1.40625, + "learning_rate": 1.853242320819113e-05, + "loss": 0.0562, + "num_tokens": 133445618.0, + "reward": 2.0546875, + "reward_std": 0.06689241528511047, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1306.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1080.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 769.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09302722539899291, + "frac_reward_zero_std": 0.3125, + "grad_norm": 2.2130622585893627, + "kl": 1.369140625, + "learning_rate": 1.8566552901023893e-05, + "loss": 0.0548, + "num_tokens": 133764370.0, + "reward": 2.0400390625, + "reward_std": 0.324171245098114, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.94921875, + "rewards/format_reward/std": 0.21998079121112823, + "rewards/tag_count_reward/mean": 0.9853515625, + "rewards/tag_count_reward/std": 0.07022564113140106, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1050.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 652.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09319791755568832, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.680958438587423, + "kl": 1.90234375, + "learning_rate": 1.8600682593856656e-05, + "loss": 0.0761, + "num_tokens": 134074802.0, + "reward": 2.1201171875, + "reward_std": 0.20119357109069824, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1405.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1060.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 882.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09336860971238371, + "frac_reward_zero_std": 0.6875, + "grad_norm": 7.661677489915279, + "kl": 1.7890625, + "learning_rate": 1.863481228668942e-05, + "loss": 0.0717, + "num_tokens": 134384354.0, + "reward": 2.08984375, + "reward_std": 0.1115587055683136, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2865179479122162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1342.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1101.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 853.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09353930186907912, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.8580785724765843, + "kl": 1.302734375, + "learning_rate": 1.8668941979522187e-05, + "loss": 0.0521, + "num_tokens": 134705378.0, + "reward": 2.05859375, + "reward_std": 0.03697281330823898, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24256734549999237, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1313.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 954.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 651.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09370999402577451, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.8248735798274587, + "kl": 1.53515625, + "learning_rate": 1.870307167235495e-05, + "loss": 0.0615, + "num_tokens": 134994482.0, + "reward": 2.0849609375, + "reward_std": 0.11161822080612183, + "rewards/accuracy_reward/mean": 0.09166666865348816, + "rewards/accuracy_reward/std": 0.28915783762931824, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1493.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1046.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 478.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09388068618246992, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.3115288481106935, + "kl": 1.76171875, + "learning_rate": 1.8737201365187715e-05, + "loss": 0.0706, + "num_tokens": 135306194.0, + "reward": 2.14453125, + "reward_std": 0.13471892476081848, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35231640934944153, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1031.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 876.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09405137833916531, + "frac_reward_zero_std": 0.6875, + "grad_norm": 2.0291437150181606, + "kl": 1.904296875, + "learning_rate": 1.877133105802048e-05, + "loss": 0.0762, + "num_tokens": 135609682.0, + "reward": 2.072265625, + "reward_std": 0.13140258193016052, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1549.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1097.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 646.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09422207049586072, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.678216475784292, + "kl": 1.830078125, + "learning_rate": 1.8805460750853242e-05, + "loss": 0.0732, + "num_tokens": 135929010.0, + "reward": 2.1484375, + "reward_std": 0.19412808120250702, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.3600577116012573, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1218.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 901.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 459.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09439276265255611, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.6366719319567626, + "kl": 1.85546875, + "learning_rate": 1.883959044368601e-05, + "loss": 0.0743, + "num_tokens": 136200770.0, + "reward": 2.1552734375, + "reward_std": 0.17081905901432037, + "rewards/accuracy_reward/mean": 0.17083333432674408, + "rewards/accuracy_reward/std": 0.3771498203277588, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 916.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 694.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09456345480925152, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7029196515603966, + "kl": 1.78125, + "learning_rate": 1.8873720136518773e-05, + "loss": 0.0712, + "num_tokens": 136474466.0, + "reward": 2.0859375, + "reward_std": 0.10434441268444061, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28082075715065, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1232.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 923.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 644.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09473414696594691, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.8597500796404248, + "kl": 1.4375, + "learning_rate": 1.8907849829351537e-05, + "loss": 0.0577, + "num_tokens": 136748658.0, + "reward": 2.1513671875, + "reward_std": 0.20764397084712982, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1289.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 950.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 631.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09490483912264232, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5167584531077933, + "kl": 1.298828125, + "learning_rate": 1.89419795221843e-05, + "loss": 0.052, + "num_tokens": 137035698.0, + "reward": 2.15625, + "reward_std": 0.15143245458602905, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 812.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 592.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09507553127933771, + "frac_reward_zero_std": 0.6875, + "grad_norm": 4.706090119178555, + "kl": 1.748046875, + "learning_rate": 1.8976109215017068e-05, + "loss": 0.07, + "num_tokens": 137283938.0, + "reward": 2.1484375, + "reward_std": 0.11641712486743927, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.3562295734882355, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 852.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 687.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09524622343603312, + "frac_reward_zero_std": 0.625, + "grad_norm": 15.543275038886636, + "kl": 2.3359375, + "learning_rate": 1.9010238907849832e-05, + "loss": 0.0933, + "num_tokens": 137557394.0, + "reward": 2.0654296875, + "reward_std": 0.12650328874588013, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1367.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 876.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 558.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09541691559272851, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.313063003790182, + "kl": 2.314453125, + "learning_rate": 1.9044368600682596e-05, + "loss": 0.0926, + "num_tokens": 137817426.0, + "reward": 2.04296875, + "reward_std": 0.09584102779626846, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1289.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 937.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 628.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09558760774942392, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.830963456818035, + "kl": 1.634765625, + "learning_rate": 1.907849829351536e-05, + "loss": 0.0652, + "num_tokens": 138096210.0, + "reward": 2.046875, + "reward_std": 0.0959337055683136, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21178513765335083, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1422.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 845.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 546.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09575829990611931, + "frac_reward_zero_std": 0.6875, + "grad_norm": 10.780012346386057, + "kl": 2.75390625, + "learning_rate": 1.9112627986348123e-05, + "loss": 0.1102, + "num_tokens": 138351938.0, + "reward": 2.1171875, + "reward_std": 0.11431500315666199, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1421.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 907.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 651.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09592899206281472, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8147925922676424, + "kl": 1.3203125, + "learning_rate": 1.914675767918089e-05, + "loss": 0.0529, + "num_tokens": 138625234.0, + "reward": 2.1240234375, + "reward_std": 0.12054750323295593, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33575257658958435, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1439.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1074.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 745.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09609968421951011, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5780745151205613, + "kl": 1.494140625, + "learning_rate": 1.918088737201365e-05, + "loss": 0.0597, + "num_tokens": 138936818.0, + "reward": 2.05078125, + "reward_std": 0.1005660742521286, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1206.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 960.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 675.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09627037637620552, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2488112566490094, + "kl": 1.453125, + "learning_rate": 1.9215017064846418e-05, + "loss": 0.0581, + "num_tokens": 139222514.0, + "reward": 2.19140625, + "reward_std": 0.1254390925168991, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.39417871832847595, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1020.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 734.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09644106853290091, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.1664783804474774, + "kl": 1.4609375, + "learning_rate": 1.924914675767918e-05, + "loss": 0.0584, + "num_tokens": 139525938.0, + "reward": 2.01171875, + "reward_std": 0.046875, + "rewards/accuracy_reward/mean": 0.01171875, + "rewards/accuracy_reward/std": 0.1078278198838234, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 972.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 729.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09661176068959632, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.194018501154072, + "kl": 1.548828125, + "learning_rate": 1.928327645051195e-05, + "loss": 0.0619, + "num_tokens": 139826610.0, + "reward": 2.02734375, + "reward_std": 0.06789018213748932, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.1634024828672409, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 845.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 604.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09678245284629171, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.7480142032671724, + "kl": 1.455078125, + "learning_rate": 1.9317406143344713e-05, + "loss": 0.0581, + "num_tokens": 140080882.0, + "reward": 2.1748046875, + "reward_std": 0.20768529176712036, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3813795745372772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 850.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 524.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09695314500298712, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5945450099895025, + "kl": 1.439453125, + "learning_rate": 1.9351535836177476e-05, + "loss": 0.0576, + "num_tokens": 140334914.0, + "reward": 2.06640625, + "reward_std": 0.06822281330823898, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1671.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1110.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 764.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09712383715968251, + "frac_reward_zero_std": 0.625, + "grad_norm": 5.0094781822786585, + "kl": 1.79296875, + "learning_rate": 1.938566552901024e-05, + "loss": 0.0719, + "num_tokens": 140658706.0, + "reward": 2.1591796875, + "reward_std": 0.12354303896427155, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.36746934056282043, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1209.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 988.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 836.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09729452931637791, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.931213150543606, + "kl": 1.48828125, + "learning_rate": 1.9419795221843004e-05, + "loss": 0.0595, + "num_tokens": 140961058.0, + "reward": 2.2529296875, + "reward_std": 0.172615647315979, + "rewards/accuracy_reward/mean": 0.25390625, + "rewards/accuracy_reward/std": 0.4360972046852112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1008.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 723.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09746522147307331, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9814950862817222, + "kl": 1.34375, + "learning_rate": 1.945392491467577e-05, + "loss": 0.0538, + "num_tokens": 141262674.0, + "reward": 2.09375, + "reward_std": 0.09341736882925034, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1838.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1263.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 745.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09763591362976871, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3340902500703746, + "kl": 1.30859375, + "learning_rate": 1.948805460750853e-05, + "loss": 0.0524, + "num_tokens": 141627634.0, + "reward": 2.1240234375, + "reward_std": 0.20545923709869385, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.05169277638196945, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1511.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1184.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 988.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09780660578646411, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1671564481814214, + "kl": 1.158203125, + "learning_rate": 1.95221843003413e-05, + "loss": 0.0464, + "num_tokens": 141978898.0, + "reward": 2.1396484375, + "reward_std": 0.15192194283008575, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35231640934944153, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1480.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1047.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 611.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09797729794315951, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.26798213164879225, + "kl": 1.447265625, + "learning_rate": 1.9556313993174062e-05, + "loss": 0.0579, + "num_tokens": 142291570.0, + "reward": 2.126953125, + "reward_std": 0.12407582998275757, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1118.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 646.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0981479900998549, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4379714131187351, + "kl": 1.26171875, + "learning_rate": 1.959044368600683e-05, + "loss": 0.0505, + "num_tokens": 142619058.0, + "reward": 2.123046875, + "reward_std": 0.15480339527130127, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3400367796421051, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1641.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1131.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 800.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09831868225655031, + "frac_reward_zero_std": 0.625, + "grad_norm": 3.5419167193551866, + "kl": 1.55859375, + "learning_rate": 1.9624573378839593e-05, + "loss": 0.0622, + "num_tokens": 142954450.0, + "reward": 2.0498046875, + "reward_std": 0.15098702907562256, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.034663453698158264, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1244.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1035.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 765.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0984893744132457, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1535323268208488, + "kl": 1.20703125, + "learning_rate": 1.9658703071672357e-05, + "loss": 0.0483, + "num_tokens": 143261970.0, + "reward": 2.0810546875, + "reward_std": 0.13617250323295593, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28082075715065, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1064.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 594.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09866006656994111, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1.3578920569056516, + "kl": 1.505859375, + "learning_rate": 1.969283276450512e-05, + "loss": 0.0602, + "num_tokens": 143575490.0, + "reward": 1.98828125, + "reward_std": 0.37720486521720886, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24253563582897186, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.11173487454652786, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0988307587266365, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.8637286136950177, + "kl": 4.5078125, + "learning_rate": 1.9726962457337885e-05, + "loss": 0.1804, + "num_tokens": 144141122.0, + "reward": 0.01171875, + "reward_std": 0.032844457775354385, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.01171875, + "rewards/tag_count_reward/std": 0.057389069348573685, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1976.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1475.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09900145088333191, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9742546959182923, + "kl": 4.2421875, + "learning_rate": 1.9761092150170652e-05, + "loss": 0.1698, + "num_tokens": 144687122.0, + "reward": 0.01171875, + "reward_std": 0.03529931977391243, + "rewards/accuracy_reward/mean": 0.00390625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0078125, + "rewards/tag_count_reward/std": 0.04358336701989174, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 914.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 90.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0991721430400273, + "frac_reward_zero_std": 0.0625, + "grad_norm": 5.197672750593691, + "kl": 3.8359375, + "learning_rate": 1.9795221843003412e-05, + "loss": 0.1531, + "num_tokens": 144960530.0, + "reward": 0.08203125, + "reward_std": 0.12339059263467789, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.08203125, + "rewards/tag_count_reward/std": 0.13687469065189362, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 170.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 92.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09934283519672271, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.247877065641731, + "kl": 5.96875, + "learning_rate": 1.982935153583618e-05, + "loss": 0.2386, + "num_tokens": 145047890.0, + "reward": 0.21484375, + "reward_std": 0.20817101001739502, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.2109375, + "rewards/tag_count_reward/std": 0.2003367394208908, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0995135273534181, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.422976965249122, + "kl": 2.86328125, + "learning_rate": 1.9863481228668943e-05, + "loss": 0.1147, + "num_tokens": 145611570.0, + "reward": 0.65234375, + "reward_std": 0.43037617206573486, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.44921875, + "rewards/format_reward/std": 0.49838894605636597, + "rewards/tag_count_reward/mean": 0.203125, + "rewards/tag_count_reward/std": 0.23999592661857605, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09968421951011351, + "frac_reward_zero_std": 0.0625, + "grad_norm": 5.403984499004734, + "kl": 1.080078125, + "learning_rate": 1.989761092150171e-05, + "loss": 0.0432, + "num_tokens": 146177618.0, + "reward": 0.8701171875, + "reward_std": 0.32676321268081665, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84765625, + "rewards/format_reward/std": 0.3600577116012573, + "rewards/tag_count_reward/mean": 0.0224609375, + "rewards/tag_count_reward/std": 0.09514384716749191, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0998549116668089, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0195811104703156, + "kl": 0.7333984375, + "learning_rate": 1.993174061433447e-05, + "loss": 0.0293, + "num_tokens": 146739602.0, + "reward": 0.6708984375, + "reward_std": 0.45737046003341675, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6640625, + "rewards/format_reward/std": 0.4732423722743988, + "rewards/tag_count_reward/mean": 0.0068359375, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10002560382350431, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5938386335962285, + "kl": 0.76171875, + "learning_rate": 1.9965870307167238e-05, + "loss": 0.0305, + "num_tokens": 147307858.0, + "reward": 0.34375, + "reward_std": 0.43537554144859314, + "rewards/accuracy_reward/mean": 0.00390625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.2734375, + "rewards/format_reward/std": 0.446596622467041, + "rewards/tag_count_reward/mean": 0.06640625, + "rewards/tag_count_reward/std": 0.13278456032276154, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1001962959801997, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.026469147562722, + "kl": 0.4658203125, + "learning_rate": 2e-05, + "loss": 0.0186, + "num_tokens": 147867970.0, + "reward": 0.3916015625, + "reward_std": 0.3303419053554535, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03515625, + "rewards/format_reward/std": 0.18453538417816162, + "rewards/tag_count_reward/mean": 0.3564453125, + "rewards/tag_count_reward/std": 0.2546575367450714, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10036698813689511, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0505813111727473, + "kl": 0.52392578125, + "learning_rate": 1.9999998225180493e-05, + "loss": 0.0209, + "num_tokens": 148439362.0, + "reward": 0.9951171875, + "reward_std": 0.3674711585044861, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.80078125, + "rewards/format_reward/std": 0.40019527077674866, + "rewards/tag_count_reward/mean": 0.1943359375, + "rewards/tag_count_reward/std": 0.244703009724617, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1005376802935905, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7243541433909372, + "kl": 0.3076171875, + "learning_rate": 1.999999290072259e-05, + "loss": 0.0123, + "num_tokens": 148997218.0, + "reward": 0.9931640625, + "reward_std": 0.5797876715660095, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.48828125, + "rewards/format_reward/std": 0.5008418560028076, + "rewards/tag_count_reward/mean": 0.5048828125, + "rewards/tag_count_reward/std": 0.2670198380947113, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10070837245028591, + "frac_reward_zero_std": 0.0, + "grad_norm": 30.270011589675573, + "kl": 0.923828125, + "learning_rate": 1.999998402662819e-05, + "loss": 0.0369, + "num_tokens": 149567346.0, + "reward": 1.365234375, + "reward_std": 0.44275936484336853, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.08821486681699753, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.31272050738334656, + "rewards/tag_count_reward/mean": 0.466796875, + "rewards/tag_count_reward/std": 0.32293248176574707, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1008790646069813, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.5322114309804755, + "kl": 0.65625, + "learning_rate": 1.9999971602900436e-05, + "loss": 0.0263, + "num_tokens": 150129698.0, + "reward": 0.6923828125, + "reward_std": 0.507875382900238, + "rewards/accuracy_reward/mean": 0.00390625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.3984375, + "rewards/format_reward/std": 0.4905354380607605, + "rewards/tag_count_reward/mean": 0.2900390625, + "rewards/tag_count_reward/std": 0.27583813667297363, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10104975676367671, + "frac_reward_zero_std": 0.0, + "grad_norm": 17.22171308371477, + "kl": 1.490234375, + "learning_rate": 1.999995562954374e-05, + "loss": 0.0596, + "num_tokens": 150693314.0, + "reward": 1.3935546875, + "reward_std": 0.5965972542762756, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.70703125, + "rewards/format_reward/std": 0.45601576566696167, + "rewards/tag_count_reward/mean": 0.6630859375, + "rewards/tag_count_reward/std": 0.32692065834999084, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1012204489203721, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.619678695340593, + "kl": 2.234375, + "learning_rate": 1.999993610656378e-05, + "loss": 0.0893, + "num_tokens": 151258162.0, + "reward": 1.75, + "reward_std": 0.4323127567768097, + "rewards/accuracy_reward/mean": 0.00390625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.31272050738334656, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.23349303007125854, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1807.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10139114107706751, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0522870270572335, + "kl": 3.15625, + "learning_rate": 1.999991303396747e-05, + "loss": 0.1264, + "num_tokens": 151763842.0, + "reward": 0.65234375, + "reward_std": 0.39261165261268616, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.2920515835285187, + "rewards/tag_count_reward/mean": 0.515625, + "rewards/tag_count_reward/std": 0.27874016761779785, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 177.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 131.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1015618332337629, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.705673693952454, + "kl": 5.7578125, + "learning_rate": 1.9999886411763017e-05, + "loss": 0.2305, + "num_tokens": 151848770.0, + "reward": 1.8359375, + "reward_std": 0.45968204736709595, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12426253408193588, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.33136674761772156, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.14497295022010803, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 249.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 221.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10173252539045831, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.6329027671355683, + "kl": 4.734375, + "learning_rate": 1.999985623995986e-05, + "loss": 0.1888, + "num_tokens": 151952194.0, + "reward": 1.9755859375, + "reward_std": 0.22201912105083466, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 0.9453125, + "rewards/format_reward/std": 0.22781464457511902, + "rewards/tag_count_reward/mean": 0.9794921875, + "rewards/tag_count_reward/std": 0.08755578845739365, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 893.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 407.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1019032175471537, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1.4164999832690723, + "kl": 2.99609375, + "learning_rate": 1.9999822518568713e-05, + "loss": 0.1199, + "num_tokens": 152227346.0, + "reward": 1.9931640625, + "reward_std": 0.26965394616127014, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.94921875, + "rewards/format_reward/std": 0.21998079121112823, + "rewards/tag_count_reward/mean": 0.9775390625, + "rewards/tag_count_reward/std": 0.10258138179779053, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1439.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1076.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 815.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10207390970384911, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.4016013523368194, + "kl": 2.6953125, + "learning_rate": 1.999978524760154e-05, + "loss": 0.1078, + "num_tokens": 152542450.0, + "reward": 2.0166015625, + "reward_std": 0.13914240896701813, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17433346807956696, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9931640625, + "rewards/tag_count_reward/std": 0.06789661198854446, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1379.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1039.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1022446018605445, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.8135141783619095, + "kl": 2.73046875, + "learning_rate": 1.999974442707158e-05, + "loss": 0.1093, + "num_tokens": 152934290.0, + "reward": 1.9638671875, + "reward_std": 0.18788401782512665, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12426253408193588, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.9755859375, + "rewards/tag_count_reward/std": 0.11349894106388092, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1412.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 994.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10241529401723991, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8820596466223545, + "kl": 2.6875, + "learning_rate": 1.999970005699332e-05, + "loss": 0.1076, + "num_tokens": 153344002.0, + "reward": 1.9853515625, + "reward_std": 0.1608453392982483, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9814453125, + "rewards/tag_count_reward/std": 0.09850580990314484, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1568.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1160.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10258598617393531, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.5945670911844204, + "kl": 2.4765625, + "learning_rate": 1.9999652137382506e-05, + "loss": 0.0991, + "num_tokens": 153786274.0, + "reward": 2.03125, + "reward_std": 0.1948685199022293, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.24947863817214966, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11042476445436478, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1480.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1161.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10275667833063071, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.9638667097005896, + "kl": 2.0390625, + "learning_rate": 1.9999600668256148e-05, + "loss": 0.0814, + "num_tokens": 154204594.0, + "reward": 2.04296875, + "reward_std": 0.2116868495941162, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.07248647511005402, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1431.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 919.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10292737048732611, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.3569124335378783, + "kl": 1.755859375, + "learning_rate": 1.9999545649632522e-05, + "loss": 0.0703, + "num_tokens": 154609234.0, + "reward": 1.994140625, + "reward_std": 0.2061580866575241, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.18453538417816162, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17433346807956696, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.06185327470302582, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1342.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 858.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10309806264402151, + "frac_reward_zero_std": 0.625, + "grad_norm": 7.5346712778971225, + "kl": 2.35546875, + "learning_rate": 1.9999487081531148e-05, + "loss": 0.0942, + "num_tokens": 154998162.0, + "reward": 1.9921875, + "reward_std": 0.14003103971481323, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15158477425575256, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.048884619027376175, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1807.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1070.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10326875480071691, + "frac_reward_zero_std": 0.0625, + "grad_norm": 17.325308230379772, + "kl": 2.05859375, + "learning_rate": 1.9999424963972824e-05, + "loss": 0.0823, + "num_tokens": 155518642.0, + "reward": 1.880859375, + "reward_std": 0.3810778856277466, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12426253408193588, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2920515835285187, + "rewards/tag_count_reward/mean": 0.958984375, + "rewards/tag_count_reward/std": 0.12825122475624084, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1505.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 831.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 572.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1034394469574123, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.5237705909515646, + "kl": 1.515625, + "learning_rate": 1.9999359296979595e-05, + "loss": 0.0606, + "num_tokens": 155772386.0, + "reward": 1.9755859375, + "reward_std": 0.13241654634475708, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.08821486681699753, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.05999100208282471, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 723.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 574.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10361013911410771, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.7800960112566004, + "kl": 1.642578125, + "learning_rate": 1.9999290080574774e-05, + "loss": 0.0657, + "num_tokens": 156000002.0, + "reward": 1.9814453125, + "reward_std": 0.17850694060325623, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.13865381479263306, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.9892578125, + "rewards/tag_count_reward/std": 0.06364604830741882, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 588.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 467.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1037808312708031, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.0170904473534226, + "kl": 1.953125, + "learning_rate": 1.999921731478293e-05, + "loss": 0.0782, + "num_tokens": 156189874.0, + "reward": 2.013671875, + "reward_std": 0.17278403043746948, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.08170124143362045, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 552.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 408.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10395152342749851, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1.6939640524891098, + "kl": 1.69921875, + "learning_rate": 1.9999140999629882e-05, + "loss": 0.068, + "num_tokens": 156366530.0, + "reward": 1.97265625, + "reward_std": 0.15949726104736328, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.08821486681699753, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.06902241706848145, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 701.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 572.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1041222155841939, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.04845788772548, + "kl": 1.560546875, + "learning_rate": 1.9999061135142734e-05, + "loss": 0.0625, + "num_tokens": 156590082.0, + "reward": 2.048828125, + "reward_std": 0.11966310441493988, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24253563582897186, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.03789619356393814, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 601.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 476.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10429290774088931, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.34775872378463274, + "kl": 1.638671875, + "learning_rate": 1.9998977721349826e-05, + "loss": 0.0655, + "num_tokens": 156787138.0, + "reward": 2.0234375, + "reward_std": 0.11595594137907028, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.03106563352048397, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 759.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 586.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1044635998975847, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.4373331750737919, + "kl": 1.61328125, + "learning_rate": 1.9998890758280773e-05, + "loss": 0.0644, + "num_tokens": 157032994.0, + "reward": 2.0625, + "reward_std": 0.23183366656303406, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.07579238712787628, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1210.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 812.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 595.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10463429205428011, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.991786549739101, + "kl": 1.720703125, + "learning_rate": 1.9998800245966434e-05, + "loss": 0.0688, + "num_tokens": 157278962.0, + "reward": 2.05859375, + "reward_std": 0.12679657340049744, + "rewards/accuracy_reward/mean": 0.07500000298023224, + "rewards/accuracy_reward/std": 0.26394179463386536, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.03106563352048397, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1093.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 853.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 634.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1048049842109755, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0951241373627267, + "kl": 1.501953125, + "learning_rate": 1.9998706184438946e-05, + "loss": 0.06, + "num_tokens": 157537378.0, + "reward": 2.0205078125, + "reward_std": 0.09226740896701813, + "rewards/accuracy_reward/mean": 0.02916666679084301, + "rewards/accuracy_reward/std": 0.16862517595291138, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1244.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 929.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 656.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10497567636767091, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.1976131417865012, + "kl": 1.73046875, + "learning_rate": 1.9998608573731696e-05, + "loss": 0.0692, + "num_tokens": 157817938.0, + "reward": 2.14453125, + "reward_std": 0.15458697080612183, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3638034462928772, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1019.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 847.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1051463685243663, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6173321784687369, + "kl": 1.404296875, + "learning_rate": 1.9998507413879333e-05, + "loss": 0.0562, + "num_tokens": 158121826.0, + "reward": 2.0439453125, + "reward_std": 0.09556656330823898, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1226.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 914.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 702.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10531706068106171, + "frac_reward_zero_std": 0.5625, + "grad_norm": 6.597473764000967, + "kl": 1.818359375, + "learning_rate": 1.999840270491776e-05, + "loss": 0.0728, + "num_tokens": 158402530.0, + "reward": 2.0693359375, + "reward_std": 0.16609402000904083, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2626400291919708, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 831.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 693.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1054877528377571, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.295037246893132, + "kl": 2.16796875, + "learning_rate": 1.9998294446884152e-05, + "loss": 0.0868, + "num_tokens": 158651346.0, + "reward": 2.0263671875, + "reward_std": 0.15928076207637787, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.041130900382995605, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 802.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 642.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10565844499445251, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5033808327559887, + "kl": 1.42578125, + "learning_rate": 1.999818263981693e-05, + "loss": 0.057, + "num_tokens": 158897890.0, + "reward": 2.0185546875, + "reward_std": 0.10642585158348083, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17433346807956696, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.05169277638196945, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1273.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 849.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 674.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1058291371511479, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.103617046314391, + "kl": 2.625, + "learning_rate": 1.9998067283755787e-05, + "loss": 0.1045, + "num_tokens": 159152306.0, + "reward": 2.080078125, + "reward_std": 0.193659707903862, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.0539139099419117, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 863.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 753.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10599982930784331, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7231112498380972, + "kl": 1.587890625, + "learning_rate": 1.9997948378741663e-05, + "loss": 0.0635, + "num_tokens": 159417842.0, + "reward": 2.0078125, + "reward_std": 0.09341736882925034, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.13865381479263306, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.04935242608189583, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1190.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 961.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 820.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1061705214645387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5179943529174719, + "kl": 2.189453125, + "learning_rate": 1.9997825924816774e-05, + "loss": 0.0876, + "num_tokens": 159704242.0, + "reward": 2.048828125, + "reward_std": 0.16345569491386414, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26889389753341675, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.07277647405862808, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1175.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 909.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10634121362123411, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.1004962650684558, + "kl": 1.78125, + "learning_rate": 1.9997699922024583e-05, + "loss": 0.0712, + "num_tokens": 160044610.0, + "reward": 2.0224609375, + "reward_std": 0.1542200893163681, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21178513765335083, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9912109375, + "rewards/tag_count_reward/std": 0.0808708667755127, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1614.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1147.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 726.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1065119057779295, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.8608086613710672, + "kl": 2.16015625, + "learning_rate": 1.9997570370409813e-05, + "loss": 0.0864, + "num_tokens": 160378946.0, + "reward": 1.978515625, + "reward_std": 0.11341102421283722, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.08821486681699753, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.03789619356393814, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1430.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1040.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 786.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10668259793462491, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.5015059909578692, + "kl": 1.482421875, + "learning_rate": 1.9997437270018454e-05, + "loss": 0.0593, + "num_tokens": 160684050.0, + "reward": 2.078125, + "reward_std": 0.18801969289779663, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.05805254727602005, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1651.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1301.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1068532900913203, + "frac_reward_zero_std": 0.125, + "grad_norm": 5.729035759568998, + "kl": 1.5, + "learning_rate": 1.999730062089775e-05, + "loss": 0.06, + "num_tokens": 161056306.0, + "reward": 2.0029296875, + "reward_std": 0.37517932057380676, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31272050738334656, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24253563582897186, + "rewards/tag_count_reward/mean": 0.9560546875, + "rewards/tag_count_reward/std": 0.14441382884979248, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1792.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1021.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10702398224801571, + "frac_reward_zero_std": 0.375, + "grad_norm": 15.90734432812293, + "kl": 2.5625, + "learning_rate": 1.9997160423096213e-05, + "loss": 0.1024, + "num_tokens": 161560898.0, + "reward": 1.798828125, + "reward_std": 0.4060233235359192, + "rewards/accuracy_reward/mean": 0.02734375, + "rewards/accuracy_reward/std": 0.1634024828672409, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3483152687549591, + "rewards/tag_count_reward/mean": 0.912109375, + "rewards/tag_count_reward/std": 0.21600164473056793, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2005.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1711.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1071946744047111, + "frac_reward_zero_std": 0.0625, + "grad_norm": 32.03094548342529, + "kl": 3.62890625, + "learning_rate": 1.9997016676663595e-05, + "loss": 0.1453, + "num_tokens": 162113218.0, + "reward": 1.7529296875, + "reward_std": 0.5580176711082458, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3780108094215393, + "rewards/tag_count_reward/mean": 0.8857421875, + "rewards/tag_count_reward/std": 0.24531260132789612, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1450.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 912.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10736536656140651, + "frac_reward_zero_std": 0.625, + "grad_norm": 2.7575214216848973, + "kl": 1.421875, + "learning_rate": 1.9996869381650935e-05, + "loss": 0.0568, + "num_tokens": 162523506.0, + "reward": 1.9853515625, + "reward_std": 0.18764862418174744, + "rewards/accuracy_reward/mean": 0.03333333507180214, + "rewards/accuracy_reward/std": 0.17988063395023346, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.9814453125, + "rewards/tag_count_reward/std": 0.09850580990314484, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1646.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1208.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 776.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1075360587181019, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.457326472602942, + "kl": 1.763671875, + "learning_rate": 1.9996718538110508e-05, + "loss": 0.0707, + "num_tokens": 162879410.0, + "reward": 2.0703125, + "reward_std": 0.11971627175807953, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2561737895011902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1071.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 832.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10770675087479731, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.9906909144687434, + "kl": 1.81640625, + "learning_rate": 1.999656414609586e-05, + "loss": 0.0726, + "num_tokens": 163197314.0, + "reward": 2.1630859375, + "reward_std": 0.179524227976799, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.3745708465576172, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1168.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 991.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 825.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1078774430314927, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.9975656871474883, + "kl": 1.64453125, + "learning_rate": 1.9996406205661792e-05, + "loss": 0.0657, + "num_tokens": 163489314.0, + "reward": 2.0478515625, + "reward_std": 0.06170108914375305, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22781464457511902, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1228.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 978.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 756.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10804813518818811, + "frac_reward_zero_std": 0.625, + "grad_norm": 831622.8426094253, + "kl": 40528.0, + "learning_rate": 1.9996244716864373e-05, + "loss": 1630.1174, + "num_tokens": 163780930.0, + "reward": 2.1005859375, + "reward_std": 0.12549859285354614, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 817.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 601.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1082188273448835, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.44113322133191, + "kl": 1.759765625, + "learning_rate": 1.9996079679760927e-05, + "loss": 0.0704, + "num_tokens": 164029986.0, + "reward": 2.1640625, + "reward_std": 0.16218778491020203, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.3710577189922333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 820.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 581.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10838951950157891, + "frac_reward_zero_std": 0.625, + "grad_norm": 168249.59680867183, + "kl": 17608.0, + "learning_rate": 1.999591109441003e-05, + "loss": 704.5712, + "num_tokens": 164276658.0, + "reward": 2.13671875, + "reward_std": 0.1679086685180664, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.34422317147254944, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 811.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 565.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1085602116582743, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.3246243885955278, + "kl": 1.75, + "learning_rate": 1.9995738960871525e-05, + "loss": 0.07, + "num_tokens": 164524594.0, + "reward": 2.05859375, + "reward_std": 0.03697281330823898, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24256734549999237, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1184.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 944.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 619.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1087309038149697, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.3156854766742714, + "kl": 1.65625, + "learning_rate": 1.999556327920652e-05, + "loss": 0.0662, + "num_tokens": 164804930.0, + "reward": 1.998046875, + "reward_std": 0.0078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.03125, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 740.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 421.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1089015959716651, + "frac_reward_zero_std": 0.625, + "grad_norm": 11147.116334782952, + "kl": 1127.0, + "learning_rate": 1.999538404947736e-05, + "loss": 44.9438, + "num_tokens": 165032530.0, + "reward": 2.119140625, + "reward_std": 0.11652223765850067, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1060.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 860.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 682.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1090722881283605, + "frac_reward_zero_std": 0.625, + "grad_norm": 12787.074647821402, + "kl": 1362.875, + "learning_rate": 1.9995201271747682e-05, + "loss": 54.3481, + "num_tokens": 165295554.0, + "reward": 2.115234375, + "reward_std": 0.09132768213748932, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3268752694129944, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 841.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 611.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1092429802850559, + "frac_reward_zero_std": 0.8125, + "grad_norm": 5631.144827058771, + "kl": 594.0, + "learning_rate": 1.9995014946082357e-05, + "loss": 23.8468, + "num_tokens": 165556386.0, + "reward": 2.01953125, + "reward_std": 0.058320626616477966, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.13865381479263306, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 750.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 646.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1094136724417513, + "frac_reward_zero_std": 0.5625, + "grad_norm": 727.0762338503446, + "kl": 80.078125, + "learning_rate": 1.9994825072547527e-05, + "loss": 3.1871, + "num_tokens": 165787570.0, + "reward": 2.0888671875, + "reward_std": 0.15729619562625885, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.06436405330896378, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 648.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 537.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1095843645984467, + "frac_reward_zero_std": 0.6875, + "grad_norm": 160.01241770285074, + "kl": 19.33203125, + "learning_rate": 1.9994631651210586e-05, + "loss": 0.7713, + "num_tokens": 165995074.0, + "reward": 2.0888671875, + "reward_std": 0.11279661953449249, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2920515835285187, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9990234375, + "rewards/tag_count_reward/std": 0.015625, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 840.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 593.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1097550567551421, + "frac_reward_zero_std": 0.5625, + "grad_norm": 475.66350250518644, + "kl": 55.25, + "learning_rate": 1.9994434682140196e-05, + "loss": 2.211, + "num_tokens": 166248546.0, + "reward": 2.091796875, + "reward_std": 0.150947704911232, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.04388983175158501, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 956.3125, + "completions/mean_terminated_length": 609.0, + "completions/min_length": 518.0, + "completions/min_terminated_length": 609.0, + "epoch": 0.1099257489118375, + "frac_reward_zero_std": 0.4375, + "grad_norm": 163.9303312561827, + "kl": 19.59375, + "learning_rate": 1.9994234165406272e-05, + "loss": 0.7807, + "num_tokens": 166532130.0, + "reward": 1.978515625, + "reward_std": 0.12561500072479248, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.08821486681699753, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.978515625, + "rewards/tag_count_reward/std": 0.11727019399404526, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 698.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 449.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1100964410685329, + "frac_reward_zero_std": 0.625, + "grad_norm": 3.2683307993891053, + "kl": 1.357421875, + "learning_rate": 1.999403010107999e-05, + "loss": 0.0542, + "num_tokens": 166749826.0, + "reward": 2.0927734375, + "reward_std": 0.1274576038122177, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29743078351020813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9951171875, + "rewards/tag_count_reward/std": 0.056234680116176605, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1015.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 488.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1102671332252283, + "frac_reward_zero_std": 0.6875, + "grad_norm": 33.298103775232555, + "kl": 4.9921875, + "learning_rate": 1.9993822489233784e-05, + "loss": 0.1994, + "num_tokens": 167054978.0, + "reward": 2.0166015625, + "reward_std": 0.05529459938406944, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.13865381479263306, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 747.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 626.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1104378253819237, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.42821616436394616, + "kl": 1.791015625, + "learning_rate": 1.9993611329941354e-05, + "loss": 0.0717, + "num_tokens": 167295298.0, + "reward": 2.009765625, + "reward_std": 0.07281509041786194, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12426253408193588, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.994140625, + "rewards/tag_count_reward/std": 0.04915804788470268, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 743.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 661.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1106085175386191, + "frac_reward_zero_std": 0.6875, + "grad_norm": 2.828745866695914, + "kl": 2.02734375, + "learning_rate": 1.9993396623277647e-05, + "loss": 0.0812, + "num_tokens": 167532690.0, + "reward": 2.0625, + "reward_std": 0.09814241528511047, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24253563582897186, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 810.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 630.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1107792096953145, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.2736166399624889, + "kl": 1.470703125, + "learning_rate": 1.9993178369318884e-05, + "loss": 0.0588, + "num_tokens": 167778450.0, + "reward": 2.185546875, + "reward_std": 0.23547902703285217, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.39417871832847595, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 785.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 621.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1109499018520099, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4130970779832564, + "kl": 1.203125, + "learning_rate": 1.999295656814253e-05, + "loss": 0.048, + "num_tokens": 168015714.0, + "reward": 2.033203125, + "reward_std": 0.08263835310935974, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.19412322342395782, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.998046875, + "rewards/tag_count_reward/std": 0.022053716704249382, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1140.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 694.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1111205940087053, + "frac_reward_zero_std": 0.625, + "grad_norm": 223.5541028373399, + "kl": 14.8125, + "learning_rate": 1.999273121982732e-05, + "loss": 0.595, + "num_tokens": 168347730.0, + "reward": 2.0439453125, + "reward_std": 0.12268427014350891, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21998079121112823, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1001.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 646.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1112912861654007, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.66009565096465, + "kl": 1.6015625, + "learning_rate": 1.9992502324453244e-05, + "loss": 0.064, + "num_tokens": 168644162.0, + "reward": 2.0751953125, + "reward_std": 0.09465862810611725, + "rewards/accuracy_reward/mean": 0.08203125, + "rewards/accuracy_reward/std": 0.2749498784542084, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.0468750037252903, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 961.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 822.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1114619783220961, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.1013727568185234, + "kl": 1.625, + "learning_rate": 1.999226988210155e-05, + "loss": 0.0649, + "num_tokens": 168927746.0, + "reward": 2.0908203125, + "reward_std": 0.13530129194259644, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "rewards/tag_count_reward/mean": 0.9970703125, + "rewards/tag_count_reward/std": 0.02695695497095585, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1094.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 907.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 728.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1116326704787915, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1.4043672459538148, + "kl": 2.53125, + "learning_rate": 1.999203389285475e-05, + "loss": 0.1012, + "num_tokens": 169202482.0, + "reward": 2.0712890625, + "reward_std": 0.21296173334121704, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3026638329029083, + "rewards/format_reward/mean": 0.98046875, + "rewards/format_reward/std": 0.13865381479263306, + "rewards/tag_count_reward/mean": 0.9892578125, + "rewards/tag_count_reward/std": 0.06364604830741882, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 994.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 893.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1118033626354869, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.1338314795850815, + "kl": 2.265625, + "learning_rate": 1.999179435679661e-05, + "loss": 0.0906, + "num_tokens": 169496626.0, + "reward": 2.068359375, + "reward_std": 0.24933139979839325, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3077581524848938, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.1634024828672409, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.06185327470302582, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1148.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 952.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 715.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1119740547921823, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.4224285679303366, + "kl": 2.140625, + "learning_rate": 1.9991551274012156e-05, + "loss": 0.0857, + "num_tokens": 169781570.0, + "reward": 2.03125, + "reward_std": 0.3239101767539978, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.31755712628364563, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24253563582897186, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.07739239931106567, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 895.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 572.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1121447469488777, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.62651545835415, + "kl": 3.31640625, + "learning_rate": 1.9991304644587672e-05, + "loss": 0.1327, + "num_tokens": 170051010.0, + "reward": 1.228515625, + "reward_std": 0.6554781794548035, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.3222736418247223, + "rewards/format_reward/mean": 0.35546875, + "rewards/format_reward/std": 0.4795927405357361, + "rewards/tag_count_reward/mean": 0.755859375, + "rewards/tag_count_reward/std": 0.2484557181596756, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 966.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 809.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1123154391055731, + "frac_reward_zero_std": 0.0, + "grad_norm": 521.785386107018, + "kl": 49.0625, + "learning_rate": 1.9991054468610705e-05, + "loss": 1.9628, + "num_tokens": 170336258.0, + "reward": 1.3740234375, + "reward_std": 0.686059832572937, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.20318391919136047, + "rewards/format_reward/mean": 0.51171875, + "rewards/format_reward/std": 0.5008418560028076, + "rewards/tag_count_reward/mean": 0.8193359375, + "rewards/tag_count_reward/std": 0.22972095012664795, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 969.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 901.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 779.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1124861312622685, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.372987468340963, + "kl": 2.265625, + "learning_rate": 1.999080074617006e-05, + "loss": 0.0908, + "num_tokens": 170610866.0, + "reward": 0.966796875, + "reward_std": 0.5930416584014893, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12426253408193588, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42443734407424927, + "rewards/tag_count_reward/mean": 0.716796875, + "rewards/tag_count_reward/std": 0.2371627539396286, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 789.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 613.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1126568234189639, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.042254827310002, + "kl": 2.50390625, + "learning_rate": 1.9990543477355797e-05, + "loss": 0.1002, + "num_tokens": 170851490.0, + "reward": 0.8193359375, + "reward_std": 0.4037613868713379, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.08821486681699753, + "rewards/format_reward/mean": 0.09765625, + "rewards/format_reward/std": 0.29743078351020813, + "rewards/tag_count_reward/mean": 0.7138671875, + "rewards/tag_count_reward/std": 0.16393771767616272, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 596.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 476.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1128275155756593, + "frac_reward_zero_std": 0.0, + "grad_norm": 86.69051753448305, + "kl": 9.046875, + "learning_rate": 1.9990282662259237e-05, + "loss": 0.3616, + "num_tokens": 171044914.0, + "reward": 1.0791015625, + "reward_std": 0.690174400806427, + "rewards/accuracy_reward/mean": 0.01171875, + "rewards/accuracy_reward/std": 0.1078278198838234, + "rewards/format_reward/mean": 0.35546875, + "rewards/format_reward/std": 0.4795927405357361, + "rewards/tag_count_reward/mean": 0.7119140625, + "rewards/tag_count_reward/std": 0.26432424783706665, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 336.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 205.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1129982077323547, + "frac_reward_zero_std": 0.1875, + "grad_norm": 346.10557803981203, + "kl": 14.8125, + "learning_rate": 1.999001830097296e-05, + "loss": 0.5928, + "num_tokens": 171178546.0, + "reward": 0.06640625, + "reward_std": 0.13596011698246002, + "rewards/accuracy_reward/mean": 0.00390625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0625, + "rewards/tag_count_reward/std": 0.15965649485588074, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 263.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1131688998890501, + "frac_reward_zero_std": 0.875, + "grad_norm": 156.45631247499506, + "kl": 17.96875, + "learning_rate": 1.998975039359081e-05, + "loss": 0.7177, + "num_tokens": 171288210.0, + "reward": 0.00390625, + "reward_std": 0.012404775246977806, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.00390625, + "rewards/tag_count_reward/std": 0.03814799711108208, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 41.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1133395920457455, + "frac_reward_zero_std": 1.0, + "grad_norm": 63.189309986424234, + "kl": 13.9375, + "learning_rate": 1.998947894020787e-05, + "loss": 0.5575, + "num_tokens": 171341570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 77.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 52.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1135102842024409, + "frac_reward_zero_std": 1.0, + "grad_norm": 43.01760976200888, + "kl": 11.328125, + "learning_rate": 1.998920394092051e-05, + "loss": 0.4531, + "num_tokens": 171402306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 65.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 48.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1136809763591363, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.786282739729286, + "kl": 11.875, + "learning_rate": 1.9988925395826343e-05, + "loss": 0.4752, + "num_tokens": 171461746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 107.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 77.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1138516685158317, + "frac_reward_zero_std": 1.0, + "grad_norm": 401.4556804300158, + "kl": 7.625, + "learning_rate": 1.9988643305024236e-05, + "loss": 0.3056, + "num_tokens": 171530594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 116.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 66.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11402236067252709, + "frac_reward_zero_std": 1.0, + "grad_norm": 35.60633615502205, + "kl": 9.015625, + "learning_rate": 1.998835766861433e-05, + "loss": 0.3606, + "num_tokens": 171601826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 116.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 89.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1141930528292225, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.567693777344727, + "kl": 7.484375, + "learning_rate": 1.9988068486698008e-05, + "loss": 0.2995, + "num_tokens": 171668754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 172.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 122.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11436374498591789, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.4861120224177995, + "kl": 6.0390625, + "learning_rate": 1.9987775759377923e-05, + "loss": 0.2413, + "num_tokens": 171758258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 189.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 153.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1145344371426133, + "frac_reward_zero_std": 1.0, + "grad_norm": 81.12182409186427, + "kl": 6.484375, + "learning_rate": 1.998747948675798e-05, + "loss": 0.2593, + "num_tokens": 171844930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 148.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11470512929930869, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4303823128664883, + "kl": 7.8984375, + "learning_rate": 1.9987179668943345e-05, + "loss": 0.3156, + "num_tokens": 171924450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 148.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 94.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1148758214560041, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.271499786197702, + "kl": 7.15625, + "learning_rate": 1.9986876306040445e-05, + "loss": 0.2862, + "num_tokens": 172004946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 121.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11504651361269949, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.990490643717624, + "kl": 7.3203125, + "learning_rate": 1.9986569398156962e-05, + "loss": 0.2926, + "num_tokens": 172075922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 148.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1152172057693949, + "frac_reward_zero_std": 1.0, + "grad_norm": 10.949351821222196, + "kl": 6.3671875, + "learning_rate": 1.998625894540184e-05, + "loss": 0.2546, + "num_tokens": 172152738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 183.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 141.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11538789792609029, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.72845393579553, + "kl": 4.8125, + "learning_rate": 1.998594494788527e-05, + "loss": 0.1929, + "num_tokens": 172252162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 118.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 74.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1155585900827857, + "frac_reward_zero_std": 1.0, + "grad_norm": 86524.48750700391, + "kl": 3416.0, + "learning_rate": 1.998562740571872e-05, + "loss": 136.5162, + "num_tokens": 172322466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 84.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 63.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11572928223948109, + "frac_reward_zero_std": 1.0, + "grad_norm": 41.34905620728263, + "kl": 6.078125, + "learning_rate": 1.9985306319014898e-05, + "loss": 0.2435, + "num_tokens": 172391298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 29.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1158999743961765, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.362784791480364, + "kl": 9.671875, + "learning_rate": 1.9984981687887783e-05, + "loss": 0.3871, + "num_tokens": 172439890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 62.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11607066655287189, + "frac_reward_zero_std": 1.0, + "grad_norm": 21.022031085505365, + "kl": 9.03125, + "learning_rate": 1.998465351245261e-05, + "loss": 0.3614, + "num_tokens": 172497250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 77.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 46.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1162413587095673, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.325724993919903, + "kl": 6.7890625, + "learning_rate": 1.998432179282586e-05, + "loss": 0.2717, + "num_tokens": 172556194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 111.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 68.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11641205086626269, + "frac_reward_zero_std": 1.0, + "grad_norm": 2626.940190511334, + "kl": 274.1875, + "learning_rate": 1.998398652912529e-05, + "loss": 10.9251, + "num_tokens": 172630530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 116.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 65.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1165827430229581, + "frac_reward_zero_std": 1.0, + "grad_norm": 2091.6853782839894, + "kl": 254.5, + "learning_rate": 1.99836477214699e-05, + "loss": 10.1764, + "num_tokens": 172696130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 67.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11675343517965349, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.937685739336761, + "kl": 6.453125, + "learning_rate": 1.9983305369979962e-05, + "loss": 0.2578, + "num_tokens": 172759074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 75.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 50.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1169241273363489, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.574076015035018, + "kl": 8.140625, + "learning_rate": 1.9982959474776993e-05, + "loss": 0.3259, + "num_tokens": 172815042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 69.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11709481949304429, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.419199168223266, + "kl": 9.640625, + "learning_rate": 1.998261003598377e-05, + "loss": 0.3853, + "num_tokens": 172870498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1172655116497397, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.086991969120232, + "kl": 10.015625, + "learning_rate": 1.998225705372434e-05, + "loss": 0.4, + "num_tokens": 172923138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 61.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11743620380643509, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.7853005385030314, + "kl": 8.34375, + "learning_rate": 1.9981900528123995e-05, + "loss": 0.3338, + "num_tokens": 172981698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 59.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1176068959631305, + "frac_reward_zero_std": 1.0, + "grad_norm": 163474.9520143093, + "kl": 10.765625, + "learning_rate": 1.998154045930929e-05, + "loss": 0.4309, + "num_tokens": 173042226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 61.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11777758811982589, + "frac_reward_zero_std": 1.0, + "grad_norm": 157868554187.15393, + "kl": 11403326.09375, + "learning_rate": 1.998117684740803e-05, + "loss": 455305.4688, + "num_tokens": 173097458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 69.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1179482802765213, + "frac_reward_zero_std": 1.0, + "grad_norm": 5047.1527506617, + "kl": 488.0, + "learning_rate": 1.998080969254929e-05, + "loss": 19.5165, + "num_tokens": 173163314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 49.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11811897243321669, + "frac_reward_zero_std": 1.0, + "grad_norm": 23.680175700115214, + "kl": 7.0546875, + "learning_rate": 1.9980438994863402e-05, + "loss": 0.2818, + "num_tokens": 173216834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1182896645899121, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.281685659100643, + "kl": 7.1015625, + "learning_rate": 1.9980064754481937e-05, + "loss": 0.2843, + "num_tokens": 173270306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 43.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11846035674660749, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.217809327377239, + "kl": 7.2109375, + "learning_rate": 1.9979686971537747e-05, + "loss": 0.2884, + "num_tokens": 173322018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 62.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1186310489033029, + "frac_reward_zero_std": 1.0, + "grad_norm": 14506.886119335873, + "kl": 90.34375, + "learning_rate": 1.9979305646164926e-05, + "loss": 3.631, + "num_tokens": 173377794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 53.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11880174105999829, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.151206204949538, + "kl": 5.015625, + "learning_rate": 1.9978920778498835e-05, + "loss": 0.2007, + "num_tokens": 173434626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 66.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 43.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1189724332166937, + "frac_reward_zero_std": 1.0, + "grad_norm": 71.71932757677831, + "kl": 10.671875, + "learning_rate": 1.9978532368676084e-05, + "loss": 0.4275, + "num_tokens": 173493954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 75.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 41.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11914312537338909, + "frac_reward_zero_std": 1.0, + "grad_norm": 25.994350583972384, + "kl": 7.203125, + "learning_rate": 1.997814041683455e-05, + "loss": 0.2884, + "num_tokens": 173553986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 98.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 53.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11931381753008449, + "frac_reward_zero_std": 1.0, + "grad_norm": 1866.9948151490837, + "kl": 5.4921875, + "learning_rate": 1.9977744923113356e-05, + "loss": 0.2195, + "num_tokens": 173619554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 81.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 49.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11948450968677989, + "frac_reward_zero_std": 1.0, + "grad_norm": 11747.640845234495, + "kl": 12.484375, + "learning_rate": 1.997734588765289e-05, + "loss": 0.4999, + "num_tokens": 173676450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 84.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 62.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11965520184347529, + "frac_reward_zero_std": 1.0, + "grad_norm": 11832.971185869905, + "kl": 8.578125, + "learning_rate": 1.997694331059479e-05, + "loss": 0.3436, + "num_tokens": 173740274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 123.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1198258940001707, + "frac_reward_zero_std": 1.0, + "grad_norm": 186391.2932627621, + "kl": 18.2109375, + "learning_rate": 1.9976537192081972e-05, + "loss": 0.727, + "num_tokens": 173814482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 120.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11999658615686609, + "frac_reward_zero_std": 1.0, + "grad_norm": 16147966.81771605, + "kl": 605.5234375, + "learning_rate": 1.9976127532258574e-05, + "loss": 24.173, + "num_tokens": 173880914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 94.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 75.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1201672783135615, + "frac_reward_zero_std": 1.0, + "grad_norm": 49954.85975061671, + "kl": 12.109375, + "learning_rate": 1.9975714331270026e-05, + "loss": 0.484, + "num_tokens": 173942466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 118.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 86.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12033797047025689, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.833855963652126, + "kl": 8.3125, + "learning_rate": 1.997529758926299e-05, + "loss": 0.3324, + "num_tokens": 174014370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 114.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 89.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1205086626269523, + "frac_reward_zero_std": 1.0, + "grad_norm": 1227.6303545056246, + "kl": 7.9140625, + "learning_rate": 1.9974877306385398e-05, + "loss": 0.3165, + "num_tokens": 174090626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 149.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12067935478364769, + "frac_reward_zero_std": 1.0, + "grad_norm": 228122.8523753201, + "kl": 21.0, + "learning_rate": 1.9974453482786437e-05, + "loss": 0.8402, + "num_tokens": 174170226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 267.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 165.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1208500469403431, + "frac_reward_zero_std": 1.0, + "grad_norm": 5309436372.68836, + "kl": 230719.5, + "learning_rate": 1.9974026118616542e-05, + "loss": 9237.083, + "num_tokens": 174284354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 534.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 423.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12102073909703849, + "frac_reward_zero_std": 1.0, + "grad_norm": 604006.0703507102, + "kl": 2338.0, + "learning_rate": 1.997359521402742e-05, + "loss": 93.5328, + "num_tokens": 174463618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 750.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 604.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1211914312537339, + "frac_reward_zero_std": 1.0, + "grad_norm": 68515.0896893517, + "kl": 6632.0, + "learning_rate": 1.9973160769172023e-05, + "loss": 265.6795, + "num_tokens": 174693586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 929.625, + "completions/mean_terminated_length": 1346.0, + "completions/min_length": 693.0, + "completions/min_terminated_length": 1346.0, + "epoch": 0.12136212341042929, + "frac_reward_zero_std": 1.0, + "grad_norm": 89767.68224558874, + "kl": 122.875, + "learning_rate": 1.9972722784204563e-05, + "loss": 4.917, + "num_tokens": 174972706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 767.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 634.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1215328155671247, + "frac_reward_zero_std": 1.0, + "grad_norm": 45.1930377839871, + "kl": 10.265625, + "learning_rate": 1.9972281259280505e-05, + "loss": 0.4108, + "num_tokens": 175214306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 767.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 691.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12170350772382009, + "frac_reward_zero_std": 1.0, + "grad_norm": 436332.7652411897, + "kl": 48.140625, + "learning_rate": 1.9971836194556583e-05, + "loss": 1.9233, + "num_tokens": 175452530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 511.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 380.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1218741998805155, + "frac_reward_zero_std": 1.0, + "grad_norm": 3258.896738321039, + "kl": 5.2421875, + "learning_rate": 1.9971387590190776e-05, + "loss": 0.2092, + "num_tokens": 175623394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 379.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 327.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12204489203721089, + "frac_reward_zero_std": 1.0, + "grad_norm": 15.78356148302318, + "kl": 5.546875, + "learning_rate": 1.9970935446342315e-05, + "loss": 0.2218, + "num_tokens": 175772594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 299.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 254.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1222155841939063, + "frac_reward_zero_std": 1.0, + "grad_norm": 97282.1263153396, + "kl": 12.03125, + "learning_rate": 1.9970479763171705e-05, + "loss": 0.4803, + "num_tokens": 175893634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 260.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 187.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12238627635060169, + "frac_reward_zero_std": 1.0, + "grad_norm": 376.05872090138706, + "kl": 5.5390625, + "learning_rate": 1.9970020540840696e-05, + "loss": 0.2216, + "num_tokens": 176001250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 166.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1225569685072971, + "frac_reward_zero_std": 1.0, + "grad_norm": 13592.562015530755, + "kl": 5.65625, + "learning_rate": 1.9969557779512287e-05, + "loss": 0.2265, + "num_tokens": 176105586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 179.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 113.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12272766066399249, + "frac_reward_zero_std": 1.0, + "grad_norm": 163677.50677958087, + "kl": 17.109375, + "learning_rate": 1.9969091479350745e-05, + "loss": 0.6834, + "num_tokens": 176196754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 125.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 84.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1228983528206879, + "frac_reward_zero_std": 1.0, + "grad_norm": 19819.18969230286, + "kl": 7.515625, + "learning_rate": 1.9968621640521596e-05, + "loss": 0.3003, + "num_tokens": 176272098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 101.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 75.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12306904497738329, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.5774038277525877, + "kl": 6.4609375, + "learning_rate": 1.996814826319161e-05, + "loss": 0.258, + "num_tokens": 176339234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 99.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1232397371340787, + "frac_reward_zero_std": 1.0, + "grad_norm": 579.5989966131724, + "kl": 4.8359375, + "learning_rate": 1.9967671347528822e-05, + "loss": 0.1932, + "num_tokens": 176410978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 89.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12341042929077409, + "frac_reward_zero_std": 1.0, + "grad_norm": 26.857885335415386, + "kl": 5.9453125, + "learning_rate": 1.996719089370251e-05, + "loss": 0.238, + "num_tokens": 176476578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 96.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1235811214474695, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.063359461074265, + "kl": 4.63671875, + "learning_rate": 1.9966706901883236e-05, + "loss": 0.1858, + "num_tokens": 176546738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 89.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 62.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12375181360416489, + "frac_reward_zero_std": 1.0, + "grad_norm": 1833.0757635541368, + "kl": 5.71875, + "learning_rate": 1.996621937224278e-05, + "loss": 0.229, + "num_tokens": 176610114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 102.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1239225057608603, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.055089487392919, + "kl": 6.3828125, + "learning_rate": 1.9965728304954213e-05, + "loss": 0.2553, + "num_tokens": 176686402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 86.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 68.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12409319791755569, + "frac_reward_zero_std": 1.0, + "grad_norm": 19822.248970614542, + "kl": 9.234375, + "learning_rate": 1.9965233700191837e-05, + "loss": 0.3689, + "num_tokens": 176746578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 134.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1242638900742511, + "frac_reward_zero_std": 1.0, + "grad_norm": 141302.59298193568, + "kl": 24.5546875, + "learning_rate": 1.9964735558131223e-05, + "loss": 0.9824, + "num_tokens": 176815970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 110.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 71.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12443458223094649, + "frac_reward_zero_std": 1.0, + "grad_norm": 507179.7593817417, + "kl": 35.3984375, + "learning_rate": 1.9964233878949194e-05, + "loss": 1.4188, + "num_tokens": 176884850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 162.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 118.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12460527438764189, + "frac_reward_zero_std": 1.0, + "grad_norm": 1708206.1006991193, + "kl": 146.0234375, + "learning_rate": 1.9963728662823823e-05, + "loss": 5.8297, + "num_tokens": 176967058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 239.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 138.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12477596654433729, + "frac_reward_zero_std": 1.0, + "grad_norm": 160814109.07361647, + "kl": 7463.9609375, + "learning_rate": 1.9963219909934448e-05, + "loss": 298.6902, + "num_tokens": 177069938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 512.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 331.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12494665870103269, + "frac_reward_zero_std": 1.0, + "grad_norm": 71354692.29828443, + "kl": 7944.6484375, + "learning_rate": 1.9962707620461653e-05, + "loss": 317.9077, + "num_tokens": 177241970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 544.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 234.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1251173508577281, + "frac_reward_zero_std": 1.0, + "grad_norm": 75.51867409543374, + "kl": 2.82421875, + "learning_rate": 1.9962191794587292e-05, + "loss": 0.1127, + "num_tokens": 177417058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 854.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 532.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12528804301442348, + "frac_reward_zero_std": 1.0, + "grad_norm": 127928.4743048524, + "kl": 6.10546875, + "learning_rate": 1.9961672432494456e-05, + "loss": 0.2448, + "num_tokens": 177683170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1884.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1265.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 795.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12545873517111888, + "frac_reward_zero_std": 1.0, + "grad_norm": 4010666.17829462, + "kl": 134.06640625, + "learning_rate": 1.99611495343675e-05, + "loss": 5.3713, + "num_tokens": 178045218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1697.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1401.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1256294273278143, + "frac_reward_zero_std": 1.0, + "grad_norm": 166.3934088064858, + "kl": 3.66796875, + "learning_rate": 1.996062310039204e-05, + "loss": 0.1467, + "num_tokens": 178519522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1762.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1417.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1258001194845097, + "frac_reward_zero_std": 1.0, + "grad_norm": 5667973810.59188, + "kl": 342014.0, + "learning_rate": 1.996009313075493e-05, + "loss": 13615.958, + "num_tokens": 179007762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1984.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1593.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12597081164120508, + "frac_reward_zero_std": 1.0, + "grad_norm": 1157237891.697607, + "kl": 1767297.046875, + "learning_rate": 1.9959559625644304e-05, + "loss": 70784.5391, + "num_tokens": 179556834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1174.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 712.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12614150379790048, + "frac_reward_zero_std": 1.0, + "grad_norm": 5824108.885410896, + "kl": 362.171875, + "learning_rate": 1.995902258524953e-05, + "loss": 14.5295, + "num_tokens": 179898450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1123.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 733.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 412.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1263121959545959, + "frac_reward_zero_std": 1.0, + "grad_norm": 44787129549.122536, + "kl": 39845890.703125, + "learning_rate": 1.9958482009761234e-05, + "loss": 1598393.125, + "num_tokens": 180126898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 640.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 497.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1264828881112913, + "frac_reward_zero_std": 1.0, + "grad_norm": 7405302.914075755, + "kl": 115.1328125, + "learning_rate": 1.9957937899371308e-05, + "loss": 4.5952, + "num_tokens": 180332274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 744.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 527.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12665358026798668, + "frac_reward_zero_std": 1.0, + "grad_norm": 17069889.373688217, + "kl": 93698.3359375, + "learning_rate": 1.9957390254272893e-05, + "loss": 3747.1375, + "num_tokens": 180563810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1747.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 897.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 563.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12682427242468208, + "frac_reward_zero_std": 1.0, + "grad_norm": 2760344.9764903877, + "kl": 189487.27734375, + "learning_rate": 1.9956839074660372e-05, + "loss": 7597.9087, + "num_tokens": 180849058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 737.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 457.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1269949645813775, + "frac_reward_zero_std": 1.0, + "grad_norm": 855728.5097447829, + "kl": 50.87890625, + "learning_rate": 1.9956284360729404e-05, + "loss": 2.0355, + "num_tokens": 181079026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1232.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 689.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 424.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1271656567380729, + "frac_reward_zero_std": 1.0, + "grad_norm": 11.064533737030905, + "kl": 2.7734375, + "learning_rate": 1.9955726112676887e-05, + "loss": 0.1113, + "num_tokens": 181295058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 699.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 369.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12733634889476828, + "frac_reward_zero_std": 1.0, + "grad_norm": 275.89776689268933, + "kl": 3.68359375, + "learning_rate": 1.9955164330700985e-05, + "loss": 0.1473, + "num_tokens": 181514738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 631.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 333.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12750704105146368, + "frac_reward_zero_std": 1.0, + "grad_norm": 2022.3885515940128, + "kl": 3.078125, + "learning_rate": 1.99545990150011e-05, + "loss": 0.1232, + "num_tokens": 181713698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 725.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 508.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1276777332081591, + "frac_reward_zero_std": 1.0, + "grad_norm": 142053.6708265352, + "kl": 5.73046875, + "learning_rate": 1.995403016577791e-05, + "loss": 0.2293, + "num_tokens": 181941794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 614.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 496.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1278484253648545, + "frac_reward_zero_std": 1.0, + "grad_norm": 67185.60371920215, + "kl": 398.04296875, + "learning_rate": 1.9953457783233328e-05, + "loss": 16.0141, + "num_tokens": 182145410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 375.375, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 195.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12801911752154987, + "frac_reward_zero_std": 1.0, + "grad_norm": 254546.04852667812, + "kl": 11.11328125, + "learning_rate": 1.9952881867570536e-05, + "loss": 0.4439, + "num_tokens": 182283010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 185.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12818980967824528, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.322440299643832, + "kl": 4.03125, + "learning_rate": 1.995230241899396e-05, + "loss": 0.1611, + "num_tokens": 182394210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 202.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1283605018349407, + "frac_reward_zero_std": 1.0, + "grad_norm": 11.47351526705353, + "kl": 4.11328125, + "learning_rate": 1.995171943770928e-05, + "loss": 0.1648, + "num_tokens": 182505458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 224.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1285311939916361, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.927302983261259, + "kl": 4.4375, + "learning_rate": 1.9951132923923434e-05, + "loss": 0.1774, + "num_tokens": 182600786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 206.875, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 156.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12870188614833147, + "frac_reward_zero_std": 1.0, + "grad_norm": 1184.709717308248, + "kl": 8.625, + "learning_rate": 1.9950542877844617e-05, + "loss": 0.3451, + "num_tokens": 182689234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 209.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12887257830502688, + "frac_reward_zero_std": 1.0, + "grad_norm": 123.29548556815736, + "kl": 4.2578125, + "learning_rate": 1.9949949299682277e-05, + "loss": 0.1702, + "num_tokens": 182782146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 226.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 167.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1290432704617223, + "frac_reward_zero_std": 1.0, + "grad_norm": 33.343108785208884, + "kl": 5.0234375, + "learning_rate": 1.9949352189647104e-05, + "loss": 0.2006, + "num_tokens": 182875858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 207.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1292139626184177, + "frac_reward_zero_std": 1.0, + "grad_norm": 268.8253704834943, + "kl": 3.61328125, + "learning_rate": 1.9948751547951052e-05, + "loss": 0.1445, + "num_tokens": 182988226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 336.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 258.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12938465477511307, + "frac_reward_zero_std": 1.0, + "grad_norm": 1269.2330728252816, + "kl": 3.4140625, + "learning_rate": 1.9948147374807334e-05, + "loss": 0.1365, + "num_tokens": 183111970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 915.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 788.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12955534693180848, + "frac_reward_zero_std": 1.0, + "grad_norm": 77.1728255635161, + "kl": 2.9921875, + "learning_rate": 1.99475396704304e-05, + "loss": 0.1199, + "num_tokens": 183385330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12972603908850389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6433400156430146, + "kl": 0.242919921875, + "learning_rate": 1.9946928435035976e-05, + "loss": 0.0097, + "num_tokens": 183951842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1298967312451993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03917762723911757, + "kl": 0.05255126953125, + "learning_rate": 1.9946313668841018e-05, + "loss": 0.0021, + "num_tokens": 184519858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13006742340189467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05872796222531741, + "kl": 0.05364990234375, + "learning_rate": 1.9945695372063746e-05, + "loss": 0.0021, + "num_tokens": 185083794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13023811555859008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03548306441524112, + "kl": 0.049072265625, + "learning_rate": 1.994507354492364e-05, + "loss": 0.002, + "num_tokens": 185648706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13040880771528549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02350873097021426, + "kl": 0.03997802734375, + "learning_rate": 1.994444818764142e-05, + "loss": 0.0016, + "num_tokens": 186211762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1305794998719809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23596130381815494, + "kl": 0.05682373046875, + "learning_rate": 1.9943819300439065e-05, + "loss": 0.0023, + "num_tokens": 186779698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13075019202867627, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.432822755536626, + "kl": 0.36724853515625, + "learning_rate": 1.9943186883539817e-05, + "loss": 0.0147, + "num_tokens": 187348866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13092088418537168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08899866071849767, + "kl": 0.1475830078125, + "learning_rate": 1.9942550937168147e-05, + "loss": 0.0059, + "num_tokens": 187910402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13109157634206708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5983492988258818, + "kl": 0.7939453125, + "learning_rate": 1.99419114615498e-05, + "loss": 0.0317, + "num_tokens": 188475042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1669.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1148.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1312622684987625, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0221602895798088, + "kl": 1.6328125, + "learning_rate": 1.994126845691177e-05, + "loss": 0.0654, + "num_tokens": 188945314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1487.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1015.125, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 676.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13143296065545787, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.48825249012621, + "kl": 2.025390625, + "learning_rate": 1.9940621923482296e-05, + "loss": 0.081, + "num_tokens": 189262722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 827.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 580.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13160365281215328, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.8189967500448683, + "kl": 2.28515625, + "learning_rate": 1.9939971861490874e-05, + "loss": 0.0912, + "num_tokens": 189511618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 572.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 345.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13177434496884868, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2623876853271063, + "kl": 2.5546875, + "learning_rate": 1.9939318271168253e-05, + "loss": 0.1023, + "num_tokens": 189697442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1643.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1090.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1319450371255441, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.132396811815504, + "kl": 2.19921875, + "learning_rate": 1.9938661152746437e-05, + "loss": 0.088, + "num_tokens": 190163026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13211572928223947, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3473036216431757, + "kl": 0.845703125, + "learning_rate": 1.9938000506458676e-05, + "loss": 0.0338, + "num_tokens": 190730050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13228642143893488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4000217299619079, + "kl": 0.20166015625, + "learning_rate": 1.9937336332539475e-05, + "loss": 0.0081, + "num_tokens": 191297490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13245711359563028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22250699633926577, + "kl": 0.1009521484375, + "learning_rate": 1.9936668631224593e-05, + "loss": 0.004, + "num_tokens": 191873026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1326278057523257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03134564080767863, + "kl": 0.0546875, + "learning_rate": 1.9935997402751043e-05, + "loss": 0.0022, + "num_tokens": 192442498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13279849790902107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010184910724372677, + "kl": 0.03363037109375, + "learning_rate": 1.9935322647357082e-05, + "loss": 0.0013, + "num_tokens": 193007330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13296919006571647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021538798669497754, + "kl": 0.030548095703125, + "learning_rate": 1.9934644365282224e-05, + "loss": 0.0012, + "num_tokens": 193575538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13313988222241188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04202044718169449, + "kl": 0.03350830078125, + "learning_rate": 1.993396255676724e-05, + "loss": 0.0013, + "num_tokens": 194138706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1333105743791073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018263214329100916, + "kl": 0.030609130859375, + "learning_rate": 1.993327722205414e-05, + "loss": 0.0012, + "num_tokens": 194709634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13348126653580267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13429750347558544, + "kl": 0.044189453125, + "learning_rate": 1.9932588361386197e-05, + "loss": 0.0018, + "num_tokens": 195271810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13365195869249807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013481596043326735, + "kl": 0.034423828125, + "learning_rate": 1.9931895975007934e-05, + "loss": 0.0014, + "num_tokens": 195835202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13382265084919348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0351937327845454, + "kl": 0.04278564453125, + "learning_rate": 1.993120006316512e-05, + "loss": 0.0017, + "num_tokens": 196399394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1339933430058889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1013224673964448, + "kl": 0.05615234375, + "learning_rate": 1.993050062610478e-05, + "loss": 0.0022, + "num_tokens": 196968002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13416403516258427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8171434144683819, + "kl": 0.1031494140625, + "learning_rate": 1.992979766407519e-05, + "loss": 0.0041, + "num_tokens": 197529026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13433472731927967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26805180664177286, + "kl": 0.1221923828125, + "learning_rate": 1.992909117732587e-05, + "loss": 0.0049, + "num_tokens": 198094674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13450541947597508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1183691169669717, + "kl": 0.1224365234375, + "learning_rate": 1.9928381166107605e-05, + "loss": 0.0049, + "num_tokens": 198652962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1346761116326705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08920682114953142, + "kl": 0.1826171875, + "learning_rate": 1.9927667630672417e-05, + "loss": 0.0073, + "num_tokens": 199217650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13484680378936587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05313385359567612, + "kl": 0.1087646484375, + "learning_rate": 1.992695057127359e-05, + "loss": 0.0044, + "num_tokens": 199776738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13501749594606127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032641365576106696, + "kl": 0.1187744140625, + "learning_rate": 1.9926229988165657e-05, + "loss": 0.0048, + "num_tokens": 200343138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13518818810275668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043895150061592024, + "kl": 0.116943359375, + "learning_rate": 1.9925505881604388e-05, + "loss": 0.0047, + "num_tokens": 200911330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13535888025945209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06483485598369296, + "kl": 0.19384765625, + "learning_rate": 1.992477825184682e-05, + "loss": 0.0077, + "num_tokens": 201475874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13552957241614746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07127669468573677, + "kl": 0.1221923828125, + "learning_rate": 1.9924047099151244e-05, + "loss": 0.0049, + "num_tokens": 202036626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13570026457284287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07714095948926453, + "kl": 0.1591796875, + "learning_rate": 1.992331242377718e-05, + "loss": 0.0064, + "num_tokens": 202602882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13587095672953828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10108805881739669, + "kl": 0.199951171875, + "learning_rate": 1.9922574225985416e-05, + "loss": 0.008, + "num_tokens": 203164610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13604164888623368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1535540577153364, + "kl": 0.228515625, + "learning_rate": 1.9921832506037988e-05, + "loss": 0.0091, + "num_tokens": 203734434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13621234104292906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11042917680385281, + "kl": 0.28076171875, + "learning_rate": 1.9921087264198178e-05, + "loss": 0.0112, + "num_tokens": 204300834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13638303319962447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0653829775765295, + "kl": 0.221435546875, + "learning_rate": 1.9920338500730517e-05, + "loss": 0.0089, + "num_tokens": 204868642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13655372535631988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19332745564080916, + "kl": 0.1734619140625, + "learning_rate": 1.9919586215900797e-05, + "loss": 0.0069, + "num_tokens": 205439282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13672441751301528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0471179621043286, + "kl": 0.165283203125, + "learning_rate": 1.9918830409976044e-05, + "loss": 0.0066, + "num_tokens": 206015458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1368951096697107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021520473846295542, + "kl": 0.1710205078125, + "learning_rate": 1.9918071083224544e-05, + "loss": 0.0069, + "num_tokens": 206578898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13706580182640607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03151762932612329, + "kl": 0.12255859375, + "learning_rate": 1.9917308235915832e-05, + "loss": 0.0049, + "num_tokens": 207136354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13723649398310148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035568126793937144, + "kl": 0.11767578125, + "learning_rate": 1.991654186832069e-05, + "loss": 0.0047, + "num_tokens": 207697890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13740718613979688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052420769193027704, + "kl": 0.135009765625, + "learning_rate": 1.9915771980711156e-05, + "loss": 0.0054, + "num_tokens": 208269538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1375778782964923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03557686960139483, + "kl": 0.10205078125, + "learning_rate": 1.99149985733605e-05, + "loss": 0.0041, + "num_tokens": 208837474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13774857045318767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034251501227480115, + "kl": 0.0992431640625, + "learning_rate": 1.9914221646543268e-05, + "loss": 0.004, + "num_tokens": 209403138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13791926260988308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044057798300835026, + "kl": 0.1051025390625, + "learning_rate": 1.991344120053523e-05, + "loss": 0.0042, + "num_tokens": 209967986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13808995476657848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025263308516322684, + "kl": 0.1146240234375, + "learning_rate": 1.9912657235613425e-05, + "loss": 0.0046, + "num_tokens": 210532834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1382606469232739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03470456282808067, + "kl": 0.08251953125, + "learning_rate": 1.9911869752056122e-05, + "loss": 0.0033, + "num_tokens": 211098658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13843133907996927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01307327683035305, + "kl": 0.080810546875, + "learning_rate": 1.9911078750142857e-05, + "loss": 0.0032, + "num_tokens": 211664834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13860203123666467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03218731032601359, + "kl": 0.086181640625, + "learning_rate": 1.9910284230154408e-05, + "loss": 0.0034, + "num_tokens": 212226786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13877272339336008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03620225727260169, + "kl": 0.1134033203125, + "learning_rate": 1.9909486192372795e-05, + "loss": 0.0045, + "num_tokens": 212798194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1389434155500555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03609948961735499, + "kl": 0.1474609375, + "learning_rate": 1.9908684637081297e-05, + "loss": 0.0059, + "num_tokens": 213374578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13911410770675087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06785377609447096, + "kl": 0.175537109375, + "learning_rate": 1.9907879564564436e-05, + "loss": 0.007, + "num_tokens": 213950466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13928479986344627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05271620094348085, + "kl": 0.168212890625, + "learning_rate": 1.9907070975107984e-05, + "loss": 0.0067, + "num_tokens": 214517890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13945549202014168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04906335655960044, + "kl": 0.161865234375, + "learning_rate": 1.9906258868998956e-05, + "loss": 0.0065, + "num_tokens": 215083794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1396261841768371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012980426899897835, + "kl": 0.0985107421875, + "learning_rate": 1.9905443246525628e-05, + "loss": 0.0039, + "num_tokens": 215648866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13979687633353247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0431083264215784, + "kl": 0.1116943359375, + "learning_rate": 1.9904624107977515e-05, + "loss": 0.0045, + "num_tokens": 216221026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13996756849022787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029301781219184867, + "kl": 0.13427734375, + "learning_rate": 1.9903801453645378e-05, + "loss": 0.0054, + "num_tokens": 216793858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14013826064692328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03227747779050005, + "kl": 0.15283203125, + "learning_rate": 1.9902975283821232e-05, + "loss": 0.0061, + "num_tokens": 217357986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1403089528036187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07758406155825776, + "kl": 0.220703125, + "learning_rate": 1.9902145598798338e-05, + "loss": 0.0088, + "num_tokens": 217924946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14047964496031407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02186845244589191, + "kl": 0.178466796875, + "learning_rate": 1.9901312398871205e-05, + "loss": 0.0071, + "num_tokens": 218491762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14065033711700947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05131216668750871, + "kl": 0.13134765625, + "learning_rate": 1.9900475684335582e-05, + "loss": 0.0053, + "num_tokens": 219056386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14082102927370488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047352579158381196, + "kl": 0.14208984375, + "learning_rate": 1.9899635455488482e-05, + "loss": 0.0057, + "num_tokens": 219615586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14099172143040029, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020087364287833614, + "kl": 0.1806640625, + "learning_rate": 1.989879171262815e-05, + "loss": 0.0072, + "num_tokens": 220179442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14116241358709566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07582448437569422, + "kl": 0.27783203125, + "learning_rate": 1.9897944456054087e-05, + "loss": 0.0111, + "num_tokens": 220750162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14133310574379107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021893970158485872, + "kl": 0.181640625, + "learning_rate": 1.9897093686067035e-05, + "loss": 0.0073, + "num_tokens": 221321170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14150379790048648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030415986307252462, + "kl": 0.181884765625, + "learning_rate": 1.989623940296899e-05, + "loss": 0.0073, + "num_tokens": 221888018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14167449005718188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048769427564913995, + "kl": 0.17724609375, + "learning_rate": 1.9895381607063195e-05, + "loss": 0.0071, + "num_tokens": 222449378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14184518221387726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04639108322998359, + "kl": 0.248291015625, + "learning_rate": 1.9894520298654126e-05, + "loss": 0.0099, + "num_tokens": 223013218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14201587437057267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06178418053909068, + "kl": 0.2578125, + "learning_rate": 1.9893655478047526e-05, + "loss": 0.0103, + "num_tokens": 223581298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14218656652726808, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10838024688696596, + "kl": 0.178955078125, + "learning_rate": 1.9892787145550372e-05, + "loss": 0.0072, + "num_tokens": 224147682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14235725868396348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10328750669189175, + "kl": 0.34228515625, + "learning_rate": 1.989191530147089e-05, + "loss": 0.0137, + "num_tokens": 224707314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14252795084065886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10770203478849905, + "kl": 0.22705078125, + "learning_rate": 1.989103994611855e-05, + "loss": 0.0091, + "num_tokens": 225273490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14269864299735427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1641869667554322, + "kl": 0.4052734375, + "learning_rate": 1.989016107980408e-05, + "loss": 0.0162, + "num_tokens": 225837330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14286933515404968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2352899517490799, + "kl": 0.179443359375, + "learning_rate": 1.9889278702839438e-05, + "loss": 0.0072, + "num_tokens": 226398258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14304002731074508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23251697361937934, + "kl": 0.49951171875, + "learning_rate": 1.988839281553784e-05, + "loss": 0.02, + "num_tokens": 226963602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14321071946744046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3120094227053712, + "kl": 0.221923828125, + "learning_rate": 1.9887503418213746e-05, + "loss": 0.0089, + "num_tokens": 227529570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14338141162413587, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.000521064730851, + "kl": 0.7734375, + "learning_rate": 1.9886610511182853e-05, + "loss": 0.031, + "num_tokens": 228102258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14355210378083127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18114718935279467, + "kl": 0.0599365234375, + "learning_rate": 1.9885714094762116e-05, + "loss": 0.0024, + "num_tokens": 228671810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14372279593752668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04536983922075339, + "kl": 0.0328369140625, + "learning_rate": 1.9884814169269727e-05, + "loss": 0.0013, + "num_tokens": 229241250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14389348809422206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11043356237391412, + "kl": 0.03363037109375, + "learning_rate": 1.988391073502513e-05, + "loss": 0.0013, + "num_tokens": 229805570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14406418025091747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009289415442425659, + "kl": 0.021484375, + "learning_rate": 1.9883003792349013e-05, + "loss": 0.0009, + "num_tokens": 230373554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14423487240761287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25483851275420755, + "kl": 0.0426025390625, + "learning_rate": 1.9882093341563307e-05, + "loss": 0.0017, + "num_tokens": 230937618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14440556456430828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028344707007921528, + "kl": 0.027313232421875, + "learning_rate": 1.9881179382991186e-05, + "loss": 0.0011, + "num_tokens": 231505698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14457625672100366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055398222000130316, + "kl": 0.0733642578125, + "learning_rate": 1.9880261916957072e-05, + "loss": 0.0029, + "num_tokens": 232069682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14474694887769907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10992494823700213, + "kl": 0.1285400390625, + "learning_rate": 1.9879340943786635e-05, + "loss": 0.0051, + "num_tokens": 232636034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14491764103439447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4622734759391114, + "kl": 0.37109375, + "learning_rate": 1.9878416463806788e-05, + "loss": 0.0149, + "num_tokens": 233210930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14508833319108988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5775442993331429, + "kl": 1.033203125, + "learning_rate": 1.9877488477345686e-05, + "loss": 0.0413, + "num_tokens": 233774866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14525902534778526, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.046079677890141, + "kl": 1.47265625, + "learning_rate": 1.987655698473273e-05, + "loss": 0.0589, + "num_tokens": 234342946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14542971750448067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3083906007157144, + "kl": 0.4208984375, + "learning_rate": 1.987562198629857e-05, + "loss": 0.0168, + "num_tokens": 234904658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14560040966117607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09002205732718613, + "kl": 0.12158203125, + "learning_rate": 1.9874683482375094e-05, + "loss": 0.0049, + "num_tokens": 235473426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14577110181787148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020412292427936737, + "kl": 0.03759765625, + "learning_rate": 1.987374147329544e-05, + "loss": 0.0015, + "num_tokens": 236037426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14594179397456686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0053464798407439895, + "kl": 0.019866943359375, + "learning_rate": 1.987279595939398e-05, + "loss": 0.0008, + "num_tokens": 236600226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14611248613126226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033214034975859834, + "kl": 0.015380859375, + "learning_rate": 1.9871846941006344e-05, + "loss": 0.0006, + "num_tokens": 237162418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14628317828795767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014290804899969795, + "kl": 0.0139312744140625, + "learning_rate": 1.9870894418469394e-05, + "loss": 0.0006, + "num_tokens": 237732802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14645387044465308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010709271190365577, + "kl": 0.01373291015625, + "learning_rate": 1.986993839212125e-05, + "loss": 0.0005, + "num_tokens": 238296194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14662456260134846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012025866239440397, + "kl": 0.012969970703125, + "learning_rate": 1.9868978862301257e-05, + "loss": 0.0005, + "num_tokens": 238862242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14679525475804386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006455409014851532, + "kl": 0.0128326416015625, + "learning_rate": 1.9868015829350018e-05, + "loss": 0.0005, + "num_tokens": 239438818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14696594691473927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015345572051321604, + "kl": 0.013031005859375, + "learning_rate": 1.9867049293609374e-05, + "loss": 0.0005, + "num_tokens": 240004706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14713663907143468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039475932691884655, + "kl": 0.0119476318359375, + "learning_rate": 1.986607925542241e-05, + "loss": 0.0005, + "num_tokens": 240575138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14730733122813006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003649650150177489, + "kl": 0.0123443603515625, + "learning_rate": 1.9865105715133462e-05, + "loss": 0.0005, + "num_tokens": 241142242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14747802338482546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002708245440542498, + "kl": 0.011962890625, + "learning_rate": 1.986412867308809e-05, + "loss": 0.0005, + "num_tokens": 241718498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14764871554152087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003253749662937562, + "kl": 0.01226806640625, + "learning_rate": 1.986314812963311e-05, + "loss": 0.0005, + "num_tokens": 242285442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14781940769821628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002882922598676411, + "kl": 0.012298583984375, + "learning_rate": 1.986216408511659e-05, + "loss": 0.0005, + "num_tokens": 242855394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14799009985491166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002487848410101968, + "kl": 0.0132904052734375, + "learning_rate": 1.986117653988782e-05, + "loss": 0.0005, + "num_tokens": 243424450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14816079201160706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017960802058997517, + "kl": 0.012542724609375, + "learning_rate": 1.9860185494297348e-05, + "loss": 0.0005, + "num_tokens": 243986562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14833148416830247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010961289077864841, + "kl": 0.0121917724609375, + "learning_rate": 1.9859190948696952e-05, + "loss": 0.0005, + "num_tokens": 244552690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14850217632499788, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.421187557279197e-05, + "kl": 0.011688232421875, + "learning_rate": 1.9858192903439674e-05, + "loss": 0.0005, + "num_tokens": 245120114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14867286848169325, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.823628723432941e-05, + "kl": 0.0121917724609375, + "learning_rate": 1.985719135887977e-05, + "loss": 0.0005, + "num_tokens": 245684194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14884356063838866, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.890091272344082e-05, + "kl": 0.01239013671875, + "learning_rate": 1.985618631537276e-05, + "loss": 0.0005, + "num_tokens": 246251954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14901425279508407, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.130088646895008e-05, + "kl": 0.0133514404296875, + "learning_rate": 1.9855177773275395e-05, + "loss": 0.0005, + "num_tokens": 246818770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14918494495177947, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.2028651084818875e-05, + "kl": 0.0126190185546875, + "learning_rate": 1.9854165732945674e-05, + "loss": 0.0005, + "num_tokens": 247388098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14935563710847485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010728495809936496, + "kl": 0.012542724609375, + "learning_rate": 1.985315019474283e-05, + "loss": 0.0005, + "num_tokens": 247966946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14952632926517026, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.253251315829226e-05, + "kl": 0.0127105712890625, + "learning_rate": 1.9852131159027347e-05, + "loss": 0.0005, + "num_tokens": 248527474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14969702142186567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011537532289168226, + "kl": 0.0125732421875, + "learning_rate": 1.9851108626160943e-05, + "loss": 0.0005, + "num_tokens": 249089410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14986771357856107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012029058602553531, + "kl": 0.0123291015625, + "learning_rate": 1.985008259650658e-05, + "loss": 0.0005, + "num_tokens": 249652770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15003840573525645, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.578182739965999e-05, + "kl": 0.0121917724609375, + "learning_rate": 1.9849053070428466e-05, + "loss": 0.0005, + "num_tokens": 250218706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15020909789195186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001312227042086953, + "kl": 0.0121917724609375, + "learning_rate": 1.9848020048292044e-05, + "loss": 0.0005, + "num_tokens": 250787986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15037979004864727, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.544618796949548e-05, + "kl": 0.0128173828125, + "learning_rate": 1.9846983530463994e-05, + "loss": 0.0005, + "num_tokens": 251349666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15055048220534267, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4188850034247434e-05, + "kl": 0.0128326416015625, + "learning_rate": 1.9845943517312247e-05, + "loss": 0.0005, + "num_tokens": 251910306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15072117436203805, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.478480688521962e-05, + "kl": 0.0118865966796875, + "learning_rate": 1.984490000920597e-05, + "loss": 0.0005, + "num_tokens": 252486578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15089186651873346, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.461169398852708e-05, + "kl": 0.0122833251953125, + "learning_rate": 1.984385300651557e-05, + "loss": 0.0005, + "num_tokens": 253049186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15106255867542887, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.867737112693473e-05, + "kl": 0.0125732421875, + "learning_rate": 1.9842802509612695e-05, + "loss": 0.0005, + "num_tokens": 253615714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15123325083212427, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3623437495374083e-05, + "kl": 0.012359619140625, + "learning_rate": 1.9841748518870233e-05, + "loss": 0.0005, + "num_tokens": 254186290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15140394298881965, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.899213696986903e-05, + "kl": 0.0129852294921875, + "learning_rate": 1.9840691034662316e-05, + "loss": 0.0005, + "num_tokens": 254755234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15157463514551506, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2373538573736763e-05, + "kl": 0.0126190185546875, + "learning_rate": 1.9839630057364308e-05, + "loss": 0.0005, + "num_tokens": 255318706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15174532730221046, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.4353842308471094e-05, + "kl": 0.0123443603515625, + "learning_rate": 1.983856558735282e-05, + "loss": 0.0005, + "num_tokens": 255884034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15191601945890587, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4438415012426605e-07, + "kl": 0.0122528076171875, + "learning_rate": 1.9837497625005703e-05, + "loss": 0.0005, + "num_tokens": 256449922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15208671161560125, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.804926427991375e-05, + "kl": 0.0122528076171875, + "learning_rate": 1.983642617070204e-05, + "loss": 0.0005, + "num_tokens": 257013282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15225740377229666, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2192067162995967e-05, + "kl": 0.0124053955078125, + "learning_rate": 1.9835351224822164e-05, + "loss": 0.0005, + "num_tokens": 257579154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15242809592899206, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.66848822221725e-05, + "kl": 0.012603759765625, + "learning_rate": 1.9834272787747635e-05, + "loss": 0.0005, + "num_tokens": 258142610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15259878808568747, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.817627074126774e-05, + "kl": 0.01202392578125, + "learning_rate": 1.9833190859861266e-05, + "loss": 0.0005, + "num_tokens": 258705218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15276948024238285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011334205238891047, + "kl": 0.0121307373046875, + "learning_rate": 1.9832105441547094e-05, + "loss": 0.0005, + "num_tokens": 259271970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15294017239907826, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.48184873713411e-05, + "kl": 0.0120697021484375, + "learning_rate": 1.9831016533190416e-05, + "loss": 0.0005, + "num_tokens": 259837202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15311086455577366, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.628390995088657e-05, + "kl": 0.012054443359375, + "learning_rate": 1.9829924135177748e-05, + "loss": 0.0005, + "num_tokens": 260400722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15328155671246907, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.033109610223419e-05, + "kl": 0.0125732421875, + "learning_rate": 1.982882824789685e-05, + "loss": 0.0005, + "num_tokens": 260962146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15345224886916445, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.341145376595258e-05, + "kl": 0.012451171875, + "learning_rate": 1.982772887173672e-05, + "loss": 0.0005, + "num_tokens": 261524514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15362294102585985, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.014933714521547e-05, + "kl": 0.0114898681640625, + "learning_rate": 1.9826626007087604e-05, + "loss": 0.0005, + "num_tokens": 262091170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15379363318255526, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.404990485584357e-05, + "kl": 0.011810302734375, + "learning_rate": 1.9825519654340975e-05, + "loss": 0.0005, + "num_tokens": 262660162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15396432533925067, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.01208415117131e-05, + "kl": 0.01220703125, + "learning_rate": 1.9824409813889555e-05, + "loss": 0.0005, + "num_tokens": 263226866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15413501749594608, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1744781085752478e-07, + "kl": 0.0122222900390625, + "learning_rate": 1.982329648612729e-05, + "loss": 0.0005, + "num_tokens": 263786242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15430570965264145, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.08429542540049e-05, + "kl": 0.0120849609375, + "learning_rate": 1.982217967144937e-05, + "loss": 0.0005, + "num_tokens": 264359250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15447640180933686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002616142645685918, + "kl": 0.0126953125, + "learning_rate": 1.9821059370252227e-05, + "loss": 0.0005, + "num_tokens": 264927778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15464709396603227, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.503839220831112e-08, + "kl": 0.0123748779296875, + "learning_rate": 1.981993558293353e-05, + "loss": 0.0005, + "num_tokens": 265499266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15481778612272767, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.958692034834558e-05, + "kl": 0.012359619140625, + "learning_rate": 1.981880830989218e-05, + "loss": 0.0005, + "num_tokens": 266064834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15498847827942305, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.5388150360435325e-05, + "kl": 0.01239013671875, + "learning_rate": 1.9817677551528314e-05, + "loss": 0.0005, + "num_tokens": 266627138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15515917043611846, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0568518628125496e-07, + "kl": 0.012786865234375, + "learning_rate": 1.9816543308243323e-05, + "loss": 0.0005, + "num_tokens": 267195490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15532986259281387, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.595388093545259e-05, + "kl": 0.011932373046875, + "learning_rate": 1.9815405580439807e-05, + "loss": 0.0005, + "num_tokens": 267762722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15550055474950927, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.32292414756021e-05, + "kl": 0.011749267578125, + "learning_rate": 1.9814264368521632e-05, + "loss": 0.0005, + "num_tokens": 268346994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15567124690620465, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.940931886396355e-08, + "kl": 0.0124969482421875, + "learning_rate": 1.9813119672893877e-05, + "loss": 0.0005, + "num_tokens": 268917138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15584193906290006, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2643781664334487e-07, + "kl": 0.0124359130859375, + "learning_rate": 1.9811971493962872e-05, + "loss": 0.0005, + "num_tokens": 269478882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15601263121959547, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.1030901318589694e-05, + "kl": 0.01287841796875, + "learning_rate": 1.9810819832136178e-05, + "loss": 0.0005, + "num_tokens": 270041794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15618332337629087, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4382899836983464e-05, + "kl": 0.0122222900390625, + "learning_rate": 1.9809664687822597e-05, + "loss": 0.0005, + "num_tokens": 270608530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15635401553298625, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.625155062135161e-05, + "kl": 0.0122833251953125, + "learning_rate": 1.9808506061432157e-05, + "loss": 0.0005, + "num_tokens": 271173682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15652470768968166, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.61807758565692e-05, + "kl": 0.01226806640625, + "learning_rate": 1.9807343953376135e-05, + "loss": 0.0005, + "num_tokens": 271737602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15669539984637706, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0788806927822322e-07, + "kl": 0.0120697021484375, + "learning_rate": 1.980617836406703e-05, + "loss": 0.0005, + "num_tokens": 272303202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15686609200307247, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.309956603412955e-05, + "kl": 0.011749267578125, + "learning_rate": 1.9805009293918592e-05, + "loss": 0.0005, + "num_tokens": 272876850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15703678415976785, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6060371220844006e-05, + "kl": 0.0115814208984375, + "learning_rate": 1.980383674334579e-05, + "loss": 0.0005, + "num_tokens": 273437298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15720747631646326, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2714805759457508e-05, + "kl": 0.0123291015625, + "learning_rate": 1.980266071276485e-05, + "loss": 0.0005, + "num_tokens": 274000386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15737816847315866, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.0522798012505016e-05, + "kl": 0.01214599609375, + "learning_rate": 1.9801481202593206e-05, + "loss": 0.0005, + "num_tokens": 274570530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15754886062985407, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.1665151597276325e-05, + "kl": 0.0124969482421875, + "learning_rate": 1.9800298213249552e-05, + "loss": 0.0005, + "num_tokens": 275137314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15771955278654945, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.185267852163626e-05, + "kl": 0.0126190185546875, + "learning_rate": 1.97991117451538e-05, + "loss": 0.0005, + "num_tokens": 275701378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15789024494324486, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.657764468895925e-05, + "kl": 0.011993408203125, + "learning_rate": 1.9797921798727112e-05, + "loss": 0.0005, + "num_tokens": 276283922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15806093709994026, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.035178850670171e-05, + "kl": 0.0125885009765625, + "learning_rate": 1.9796728374391866e-05, + "loss": 0.0005, + "num_tokens": 276852050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15823162925663567, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.464914690260456e-05, + "kl": 0.011749267578125, + "learning_rate": 1.979553147257169e-05, + "loss": 0.0005, + "num_tokens": 277416994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15840232141333105, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2519696553293854e-05, + "kl": 0.0125885009765625, + "learning_rate": 1.979433109369144e-05, + "loss": 0.0005, + "num_tokens": 277979874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15857301357002646, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2605000028278823e-05, + "kl": 0.0119476318359375, + "learning_rate": 1.97931272381772e-05, + "loss": 0.0005, + "num_tokens": 278544658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15874370572672186, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.07439275758633e-05, + "kl": 0.0120697021484375, + "learning_rate": 1.9791919906456308e-05, + "loss": 0.0005, + "num_tokens": 279108178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15891439788341727, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.750632121533806e-05, + "kl": 0.0120697021484375, + "learning_rate": 1.9790709098957316e-05, + "loss": 0.0005, + "num_tokens": 279672002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15908509004011265, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.595290626462459e-05, + "kl": 0.0123291015625, + "learning_rate": 1.978949481611002e-05, + "loss": 0.0005, + "num_tokens": 280238002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15925578219680805, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.40874742851031e-05, + "kl": 0.011932373046875, + "learning_rate": 1.978827705834544e-05, + "loss": 0.0005, + "num_tokens": 280804962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15942647435350346, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.6603330802659185e-05, + "kl": 0.0119171142578125, + "learning_rate": 1.9787055826095847e-05, + "loss": 0.0005, + "num_tokens": 281369666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15959716651019887, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0604987603022423e-07, + "kl": 0.0123138427734375, + "learning_rate": 1.9785831119794726e-05, + "loss": 0.0005, + "num_tokens": 281929714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15976785866689425, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2541760774438848e-05, + "kl": 0.0135040283203125, + "learning_rate": 1.9784602939876804e-05, + "loss": 0.0005, + "num_tokens": 282494050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15993855082358965, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.513093721717969e-05, + "kl": 0.0130462646484375, + "learning_rate": 1.9783371286778043e-05, + "loss": 0.0005, + "num_tokens": 283057538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16010924298028506, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.354839001773786e-08, + "kl": 0.01226806640625, + "learning_rate": 1.9782136160935637e-05, + "loss": 0.0005, + "num_tokens": 283623378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16027993513698047, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.044939865071165e-08, + "kl": 0.012176513671875, + "learning_rate": 1.9780897562788003e-05, + "loss": 0.0005, + "num_tokens": 284190626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16045062729367585, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.609704695633043e-05, + "kl": 0.0128936767578125, + "learning_rate": 1.977965549277481e-05, + "loss": 0.0005, + "num_tokens": 284758306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16062131945037125, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.398013179466143e-05, + "kl": 0.01239013671875, + "learning_rate": 1.9778409951336938e-05, + "loss": 0.0005, + "num_tokens": 285320930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16079201160706666, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.520747832892999e-08, + "kl": 0.0122222900390625, + "learning_rate": 1.9777160938916518e-05, + "loss": 0.0005, + "num_tokens": 285882098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16096270376376207, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.607836757367335e-05, + "kl": 0.0123291015625, + "learning_rate": 1.9775908455956897e-05, + "loss": 0.0005, + "num_tokens": 286439090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16113339592045745, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.822579943249009e-05, + "kl": 0.011932373046875, + "learning_rate": 1.9774652502902666e-05, + "loss": 0.0005, + "num_tokens": 287002018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16130408807715285, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.584892650988797e-08, + "kl": 0.0124969482421875, + "learning_rate": 1.977339308019964e-05, + "loss": 0.0005, + "num_tokens": 287566866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16147478023384826, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2238828043281416e-05, + "kl": 0.0126953125, + "learning_rate": 1.977213018829487e-05, + "loss": 0.0005, + "num_tokens": 288135074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16164547239054367, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2226654669046937e-05, + "kl": 0.0119171142578125, + "learning_rate": 1.9770863827636634e-05, + "loss": 0.0005, + "num_tokens": 288695410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16181616454723904, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.606668600086454e-05, + "kl": 0.012359619140625, + "learning_rate": 1.9769593998674453e-05, + "loss": 0.0005, + "num_tokens": 289257986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16198685670393445, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.6953536220771076e-05, + "kl": 0.0117034912109375, + "learning_rate": 1.9768320701859062e-05, + "loss": 0.0005, + "num_tokens": 289820578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16215754886062986, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.847085974828415e-05, + "kl": 0.012786865234375, + "learning_rate": 1.9767043937642437e-05, + "loss": 0.0005, + "num_tokens": 290388786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16232824101732526, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.5617368093633657e-05, + "kl": 0.0122833251953125, + "learning_rate": 1.9765763706477782e-05, + "loss": 0.0005, + "num_tokens": 290959682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16249893317402064, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.691860560784927e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.9764480008819536e-05, + "loss": 0.0005, + "num_tokens": 291520194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16266962533071605, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.587084440749672e-08, + "kl": 0.0125579833984375, + "learning_rate": 1.9763192845123368e-05, + "loss": 0.0005, + "num_tokens": 292085410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16284031748741146, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.291249160696098e-08, + "kl": 0.011566162109375, + "learning_rate": 1.976190221584617e-05, + "loss": 0.0005, + "num_tokens": 292651746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16301100964410686, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.6809619461320284e-05, + "kl": 0.011749267578125, + "learning_rate": 1.9760608121446066e-05, + "loss": 0.0005, + "num_tokens": 293219250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16318170180080224, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.490284388210639e-05, + "kl": 0.012359619140625, + "learning_rate": 1.9759310562382423e-05, + "loss": 0.0005, + "num_tokens": 293782994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16335239395749765, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.140536623521666e-08, + "kl": 0.0124664306640625, + "learning_rate": 1.9758009539115814e-05, + "loss": 0.0005, + "num_tokens": 294347554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16352308611419306, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.526096479633509e-05, + "kl": 0.01202392578125, + "learning_rate": 1.9756705052108068e-05, + "loss": 0.0005, + "num_tokens": 294910658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16369377827088846, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3370530462319156e-05, + "kl": 0.012237548828125, + "learning_rate": 1.9755397101822223e-05, + "loss": 0.0005, + "num_tokens": 295480770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16386447042758384, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.431710096036934e-08, + "kl": 0.012420654296875, + "learning_rate": 1.9754085688722554e-05, + "loss": 0.0005, + "num_tokens": 296042786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16403516258427925, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.852065226991576e-05, + "kl": 0.0122833251953125, + "learning_rate": 1.9752770813274574e-05, + "loss": 0.0005, + "num_tokens": 296607586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16420585474097465, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2105006796053933e-05, + "kl": 0.01251220703125, + "learning_rate": 1.9751452475945005e-05, + "loss": 0.0005, + "num_tokens": 297171426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16437654689767006, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.8447644930544516e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.9750130677201816e-05, + "loss": 0.0005, + "num_tokens": 297742770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16454723905436544, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.492223924879773e-05, + "kl": 0.0121002197265625, + "learning_rate": 1.9748805417514195e-05, + "loss": 0.0005, + "num_tokens": 298309602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16471793121106085, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.529033553272863e-08, + "kl": 0.01226806640625, + "learning_rate": 1.9747476697352567e-05, + "loss": 0.0005, + "num_tokens": 298871730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16488862336775625, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.249998155503558e-08, + "kl": 0.012542724609375, + "learning_rate": 1.974614451718857e-05, + "loss": 0.0005, + "num_tokens": 299436306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16505931552445166, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6544524858116474e-05, + "kl": 0.01239013671875, + "learning_rate": 1.9744808877495084e-05, + "loss": 0.0005, + "num_tokens": 299997458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16523000768114704, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.268020873540216e-08, + "kl": 0.0121612548828125, + "learning_rate": 1.9743469778746222e-05, + "loss": 0.0005, + "num_tokens": 300570722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16540069983784245, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.6169687710273797e-05, + "kl": 0.011962890625, + "learning_rate": 1.9742127221417298e-05, + "loss": 0.0005, + "num_tokens": 301142338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16557139199453785, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.199563270595389e-05, + "kl": 0.0125274658203125, + "learning_rate": 1.9740781205984888e-05, + "loss": 0.0005, + "num_tokens": 301716130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16574208415123326, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.970391168894958e-08, + "kl": 0.0125579833984375, + "learning_rate": 1.9739431732926765e-05, + "loss": 0.0005, + "num_tokens": 302282962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16591277630792864, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.27046824100967e-05, + "kl": 0.0118408203125, + "learning_rate": 1.9738078802721957e-05, + "loss": 0.0005, + "num_tokens": 302851666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16608346846462405, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.0337054339241904e-05, + "kl": 0.011749267578125, + "learning_rate": 1.9736722415850694e-05, + "loss": 0.0005, + "num_tokens": 303418818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16625416062131945, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.726748518173096e-08, + "kl": 0.012176513671875, + "learning_rate": 1.9735362572794454e-05, + "loss": 0.0005, + "num_tokens": 303982594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16642485277801486, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.326410753239673e-08, + "kl": 0.01165771484375, + "learning_rate": 1.973399927403592e-05, + "loss": 0.0005, + "num_tokens": 304548226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16659554493471024, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.9814063961840816e-05, + "kl": 0.01214599609375, + "learning_rate": 1.9732632520059025e-05, + "loss": 0.0005, + "num_tokens": 305111170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16676623709140564, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.374964419054603e-05, + "kl": 0.012542724609375, + "learning_rate": 1.9731262311348913e-05, + "loss": 0.0005, + "num_tokens": 305682370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16693692924810105, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.979226579526443e-08, + "kl": 0.012542724609375, + "learning_rate": 1.972988864839196e-05, + "loss": 0.0005, + "num_tokens": 306250434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16710762140479646, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.617391432473047e-05, + "kl": 0.012115478515625, + "learning_rate": 1.9728511531675763e-05, + "loss": 0.0005, + "num_tokens": 306817618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16727831356149184, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.919443417564609e-08, + "kl": 0.011932373046875, + "learning_rate": 1.9727130961689155e-05, + "loss": 0.0005, + "num_tokens": 307382962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16744900571818724, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.0348863365752445e-05, + "kl": 0.012237548828125, + "learning_rate": 1.972574693892218e-05, + "loss": 0.0005, + "num_tokens": 307948978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16761969787488265, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.819596621787608e-08, + "kl": 0.0123748779296875, + "learning_rate": 1.9724359463866123e-05, + "loss": 0.0005, + "num_tokens": 308511362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16779039003157806, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.079332992234702e-08, + "kl": 0.0121612548828125, + "learning_rate": 1.9722968537013484e-05, + "loss": 0.0005, + "num_tokens": 309074962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16796108218827344, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.527188344182469e-05, + "kl": 0.011962890625, + "learning_rate": 1.9721574158858e-05, + "loss": 0.0005, + "num_tokens": 309640498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16813177434496884, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.590912842901438e-08, + "kl": 0.0120697021484375, + "learning_rate": 1.9720176329894612e-05, + "loss": 0.0005, + "num_tokens": 310204242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16830246650166425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015326553462813316, + "kl": 0.0122833251953125, + "learning_rate": 1.971877505061951e-05, + "loss": 0.0005, + "num_tokens": 310769698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16847315865835966, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2225648454552866e-05, + "kl": 0.01239013671875, + "learning_rate": 1.9717370321530085e-05, + "loss": 0.0005, + "num_tokens": 311332802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16864385081505504, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.661908925291866e-08, + "kl": 0.0122833251953125, + "learning_rate": 1.9715962143124975e-05, + "loss": 0.0005, + "num_tokens": 311892578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16881454297175044, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2284887843704655e-05, + "kl": 0.012420654296875, + "learning_rate": 1.9714550515904033e-05, + "loss": 0.0005, + "num_tokens": 312453170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16898523512844585, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.675662271411803e-08, + "kl": 0.012298583984375, + "learning_rate": 1.9713135440368335e-05, + "loss": 0.0005, + "num_tokens": 313027906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16915592728514126, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.761803218381965e-08, + "kl": 0.012054443359375, + "learning_rate": 1.9711716917020176e-05, + "loss": 0.0005, + "num_tokens": 313592450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16932661944183663, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.546405049375791e-08, + "kl": 0.0120391845703125, + "learning_rate": 1.9710294946363085e-05, + "loss": 0.0005, + "num_tokens": 314152930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16949731159853204, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4734335690773567e-05, + "kl": 0.0119476318359375, + "learning_rate": 1.970886952890181e-05, + "loss": 0.0005, + "num_tokens": 314716690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16966800375522745, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.056744913615493e-05, + "kl": 0.011810302734375, + "learning_rate": 1.9707440665142322e-05, + "loss": 0.0005, + "num_tokens": 315283842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16983869591192285, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.485930820943237e-08, + "kl": 0.011871337890625, + "learning_rate": 1.9706008355591817e-05, + "loss": 0.0005, + "num_tokens": 315853826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17000938806861823, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.5095387028940556e-05, + "kl": 0.011810302734375, + "learning_rate": 1.9704572600758712e-05, + "loss": 0.0005, + "num_tokens": 316418594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17018008022531364, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.456631278666253e-08, + "kl": 0.0124359130859375, + "learning_rate": 1.970313340115265e-05, + "loss": 0.0005, + "num_tokens": 316985170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17035077238200905, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.4961091404817605e-05, + "kl": 0.012664794921875, + "learning_rate": 1.970169075728449e-05, + "loss": 0.0005, + "num_tokens": 317548690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17052146453870445, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3409343716431398e-05, + "kl": 0.0124969482421875, + "learning_rate": 1.9700244669666327e-05, + "loss": 0.0005, + "num_tokens": 318110882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17069215669539983, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4642906410868805e-05, + "kl": 0.0125732421875, + "learning_rate": 1.9698795138811463e-05, + "loss": 0.0005, + "num_tokens": 318677490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17086284885209524, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6035500991469905e-05, + "kl": 0.0126800537109375, + "learning_rate": 1.969734216523443e-05, + "loss": 0.0005, + "num_tokens": 319243122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17103354100879065, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.828319232319948e-05, + "kl": 0.0122528076171875, + "learning_rate": 1.9695885749450982e-05, + "loss": 0.0005, + "num_tokens": 319811570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17120423316548605, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.369933259267658e-05, + "kl": 0.01263427734375, + "learning_rate": 1.96944258919781e-05, + "loss": 0.0005, + "num_tokens": 320371586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17137492532218146, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3388914955317687e-05, + "kl": 0.01263427734375, + "learning_rate": 1.9692962593333968e-05, + "loss": 0.0005, + "num_tokens": 320939330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17154561747887684, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.1440316765646417e-05, + "kl": 0.0121917724609375, + "learning_rate": 1.969149585403801e-05, + "loss": 0.0005, + "num_tokens": 321506930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17171630963557225, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6038607111080078e-05, + "kl": 0.0118408203125, + "learning_rate": 1.9690025674610874e-05, + "loss": 0.0005, + "num_tokens": 322077106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17188700179226765, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.742481012518278e-05, + "kl": 0.0127410888671875, + "learning_rate": 1.968855205557441e-05, + "loss": 0.0005, + "num_tokens": 322647346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17205769394896306, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6020353968740242e-05, + "kl": 0.0128021240234375, + "learning_rate": 1.96870749974517e-05, + "loss": 0.0005, + "num_tokens": 323209698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17222838610565844, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.788562820414892e-05, + "kl": 0.0123748779296875, + "learning_rate": 1.9685594500767054e-05, + "loss": 0.0005, + "num_tokens": 323776466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17239907826235384, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.3438902273264e-08, + "kl": 0.0122528076171875, + "learning_rate": 1.9684110566045987e-05, + "loss": 0.0005, + "num_tokens": 324338034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17256977041904925, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.585545232735993e-05, + "kl": 0.01220703125, + "learning_rate": 1.9682623193815243e-05, + "loss": 0.0005, + "num_tokens": 324905282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17274046257574466, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4390423463250524e-05, + "kl": 0.01202392578125, + "learning_rate": 1.9681132384602795e-05, + "loss": 0.0005, + "num_tokens": 325470866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17291115473244004, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.892335979799386e-08, + "kl": 0.0120391845703125, + "learning_rate": 1.9679638138937813e-05, + "loss": 0.0005, + "num_tokens": 326041170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17308184688913544, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.602490550099794e-05, + "kl": 0.01239013671875, + "learning_rate": 1.967814045735071e-05, + "loss": 0.0005, + "num_tokens": 326605506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17325253904583085, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.293686911699716e-05, + "kl": 0.0123748779296875, + "learning_rate": 1.9676639340373103e-05, + "loss": 0.0005, + "num_tokens": 327170706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17342323120252626, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.576449406319233e-05, + "kl": 0.0125274658203125, + "learning_rate": 1.9675134788537838e-05, + "loss": 0.0005, + "num_tokens": 327741602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17359392335922164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013207548352916823, + "kl": 0.0127105712890625, + "learning_rate": 1.9673626802378978e-05, + "loss": 0.0005, + "num_tokens": 328311874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17376461551591704, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.168803244042394e-08, + "kl": 0.012359619140625, + "learning_rate": 1.9672115382431796e-05, + "loss": 0.0005, + "num_tokens": 328883698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17393530767261245, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.03491387884916e-08, + "kl": 0.0118560791015625, + "learning_rate": 1.96706005292328e-05, + "loss": 0.0005, + "num_tokens": 329450050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17410599982930786, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.002237755553435e-08, + "kl": 0.011871337890625, + "learning_rate": 1.9669082243319703e-05, + "loss": 0.0005, + "num_tokens": 330011426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17427669198600323, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.162641126192217e-08, + "kl": 0.012542724609375, + "learning_rate": 1.9667560525231444e-05, + "loss": 0.0005, + "num_tokens": 330577026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17444738414269864, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.600686173481519e-05, + "kl": 0.0118560791015625, + "learning_rate": 1.9666035375508178e-05, + "loss": 0.0005, + "num_tokens": 331144930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17461807629939405, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.656843489342958e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.966450679469128e-05, + "loss": 0.0005, + "num_tokens": 331707666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17478876845608946, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.063941344216956e-05, + "kl": 0.0127716064453125, + "learning_rate": 1.9662974783323334e-05, + "loss": 0.0005, + "num_tokens": 332274978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17495946061278483, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.89863561394413e-08, + "kl": 0.0121002197265625, + "learning_rate": 1.9661439341948154e-05, + "loss": 0.0005, + "num_tokens": 332840978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17513015276948024, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.5105222732125886e-05, + "kl": 0.012115478515625, + "learning_rate": 1.9659900471110765e-05, + "loss": 0.0005, + "num_tokens": 333413522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17530084492617565, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.129270965922156e-05, + "kl": 0.012939453125, + "learning_rate": 1.9658358171357418e-05, + "loss": 0.0005, + "num_tokens": 333980898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17547153708287105, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.997056299005278e-08, + "kl": 0.0122833251953125, + "learning_rate": 1.9656812443235557e-05, + "loss": 0.0005, + "num_tokens": 334546722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17564222923956643, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.5987373513424365e-05, + "kl": 0.0123138427734375, + "learning_rate": 1.9655263287293874e-05, + "loss": 0.0005, + "num_tokens": 335123346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17581292139626184, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.7016620350265066e-05, + "kl": 0.0121307373046875, + "learning_rate": 1.965371070408226e-05, + "loss": 0.0005, + "num_tokens": 335684738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17598361355295725, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2750359743099815e-05, + "kl": 0.01251220703125, + "learning_rate": 1.9652154694151827e-05, + "loss": 0.0005, + "num_tokens": 336252578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17615430570965265, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4471755906257083e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.9650595258054897e-05, + "loss": 0.0005, + "num_tokens": 336822722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17632499786634803, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2719934214491672e-05, + "kl": 0.011871337890625, + "learning_rate": 1.9649032396345017e-05, + "loss": 0.0005, + "num_tokens": 337391906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17649569002304344, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.892372663324893e-08, + "kl": 0.0123748779296875, + "learning_rate": 1.9647466109576947e-05, + "loss": 0.0005, + "num_tokens": 337954226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17666638217973885, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.143369216214221e-08, + "kl": 0.012847900390625, + "learning_rate": 1.9645896398306662e-05, + "loss": 0.0005, + "num_tokens": 338529266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17683707433643425, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3179275076135467e-05, + "kl": 0.011932373046875, + "learning_rate": 1.9644323263091348e-05, + "loss": 0.0005, + "num_tokens": 339095138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17700776649312963, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.54294311750016e-08, + "kl": 0.0124969482421875, + "learning_rate": 1.964274670448942e-05, + "loss": 0.0005, + "num_tokens": 339665970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17717845864982504, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.221816515900195e-08, + "kl": 0.012908935546875, + "learning_rate": 1.9641166723060495e-05, + "loss": 0.0005, + "num_tokens": 340231282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17734915080652044, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.58873878754154e-05, + "kl": 0.0121002197265625, + "learning_rate": 1.963958331936541e-05, + "loss": 0.0005, + "num_tokens": 340799554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17751984296321585, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.171684914931625e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.9637996493966213e-05, + "loss": 0.0005, + "num_tokens": 341364834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17769053511991123, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.54713181538364e-05, + "kl": 0.01275634765625, + "learning_rate": 1.9636406247426178e-05, + "loss": 0.0005, + "num_tokens": 341938306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17786122727660664, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.228057282455396e-08, + "kl": 0.0120391845703125, + "learning_rate": 1.9634812580309773e-05, + "loss": 0.0005, + "num_tokens": 342502242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17803191943330204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10931907091595043, + "kl": 0.0173187255859375, + "learning_rate": 1.9633215493182702e-05, + "loss": 0.0007, + "num_tokens": 343070930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17820261158999745, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.667285058069984e-05, + "kl": 0.011962890625, + "learning_rate": 1.963161498661187e-05, + "loss": 0.0005, + "num_tokens": 343633890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17837330374669283, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.263696521443505e-08, + "kl": 0.01226806640625, + "learning_rate": 1.9630011061165398e-05, + "loss": 0.0005, + "num_tokens": 344199682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17854399590338824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006354407878443677, + "kl": 0.0120391845703125, + "learning_rate": 1.9628403717412622e-05, + "loss": 0.0005, + "num_tokens": 344761986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17871468806008364, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.146609979776372e-05, + "kl": 0.0123291015625, + "learning_rate": 1.9626792955924096e-05, + "loss": 0.0005, + "num_tokens": 345330114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17888538021677905, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.658510832130256e-05, + "kl": 0.0121917724609375, + "learning_rate": 1.962517877727157e-05, + "loss": 0.0005, + "num_tokens": 345890786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17905607237347443, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.73394761555558e-05, + "kl": 0.0122833251953125, + "learning_rate": 1.9623561182028034e-05, + "loss": 0.0005, + "num_tokens": 346455810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17922676453016984, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1729071316534714e-05, + "kl": 0.012542724609375, + "learning_rate": 1.9621940170767665e-05, + "loss": 0.0005, + "num_tokens": 347026114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17939745668686524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037603076562160273, + "kl": 0.01239013671875, + "learning_rate": 1.962031574406587e-05, + "loss": 0.0005, + "num_tokens": 347593282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17956814884356065, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.370532425270898e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.9618687902499257e-05, + "loss": 0.0005, + "num_tokens": 348159042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17973884100025603, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.345740263454693e-05, + "kl": 0.01214599609375, + "learning_rate": 1.9617056646645655e-05, + "loss": 0.0005, + "num_tokens": 348739554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17990953315695143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006204225777053206, + "kl": 0.01263427734375, + "learning_rate": 1.96154219770841e-05, + "loss": 0.0005, + "num_tokens": 349303426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18008022531364684, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.465126298337018e-05, + "kl": 0.0125274658203125, + "learning_rate": 1.961378389439484e-05, + "loss": 0.0005, + "num_tokens": 349866066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18025091747034225, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.389681531954083e-05, + "kl": 0.0124053955078125, + "learning_rate": 1.9612142399159336e-05, + "loss": 0.0005, + "num_tokens": 350429010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18042160962703763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001746591281480867, + "kl": 0.01220703125, + "learning_rate": 1.9610497491960256e-05, + "loss": 0.0005, + "num_tokens": 350993986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18059230178373303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006567415791305393, + "kl": 0.012237548828125, + "learning_rate": 1.9608849173381483e-05, + "loss": 0.0005, + "num_tokens": 351555682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18076299394042844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016835948107918745, + "kl": 0.012359619140625, + "learning_rate": 1.9607197444008118e-05, + "loss": 0.0005, + "num_tokens": 352123506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18093368609712385, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.4183854232481416e-05, + "kl": 0.012054443359375, + "learning_rate": 1.9605542304426456e-05, + "loss": 0.0005, + "num_tokens": 352688994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18110437825381923, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.8526448274637e-05, + "kl": 0.0124053955078125, + "learning_rate": 1.9603883755224023e-05, + "loss": 0.0005, + "num_tokens": 353257490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18127507041051463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010656167517481765, + "kl": 0.012359619140625, + "learning_rate": 1.960222179698953e-05, + "loss": 0.0005, + "num_tokens": 353823778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18144576256721004, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.618334152524561e-05, + "kl": 0.012481689453125, + "learning_rate": 1.9600556430312923e-05, + "loss": 0.0005, + "num_tokens": 354394210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18161645472390545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014605871278590816, + "kl": 0.01263427734375, + "learning_rate": 1.9598887655785343e-05, + "loss": 0.0005, + "num_tokens": 354961986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18178714688060083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034337667974025014, + "kl": 0.012298583984375, + "learning_rate": 1.9597215473999146e-05, + "loss": 0.0005, + "num_tokens": 355527026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18195783903729623, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.308184137733167e-05, + "kl": 0.01171875, + "learning_rate": 1.9595539885547894e-05, + "loss": 0.0005, + "num_tokens": 356091106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18212853119399164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001760101239484766, + "kl": 0.012298583984375, + "learning_rate": 1.959386089102636e-05, + "loss": 0.0005, + "num_tokens": 356655410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18229922335068705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03525592143112874, + "kl": 0.015228271484375, + "learning_rate": 1.959217849103053e-05, + "loss": 0.0006, + "num_tokens": 357214802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18246991550738242, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.0172851412193854e-05, + "kl": 0.012176513671875, + "learning_rate": 1.9590492686157595e-05, + "loss": 0.0005, + "num_tokens": 357779778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18264060766407783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048789851830104326, + "kl": 0.0126190185546875, + "learning_rate": 1.958880347700595e-05, + "loss": 0.0005, + "num_tokens": 358343634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18281129982077324, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.108823180724921e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.958711086417521e-05, + "loss": 0.0005, + "num_tokens": 358905810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18298199197746864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018313753780232242, + "kl": 0.0125274658203125, + "learning_rate": 1.9585414848266185e-05, + "loss": 0.0005, + "num_tokens": 359491730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18315268413416402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012182868098607501, + "kl": 0.0118408203125, + "learning_rate": 1.9583715429880904e-05, + "loss": 0.0005, + "num_tokens": 360051170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18332337629085943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010363052781638365, + "kl": 0.0120697021484375, + "learning_rate": 1.9582012609622598e-05, + "loss": 0.0005, + "num_tokens": 360617762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18349406844755484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043752077959692234, + "kl": 0.0120086669921875, + "learning_rate": 1.9580306388095707e-05, + "loss": 0.0005, + "num_tokens": 361182018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18366476060425024, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.387270681014923e-05, + "kl": 0.0125274658203125, + "learning_rate": 1.9578596765905875e-05, + "loss": 0.0005, + "num_tokens": 361742018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18383545276094562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011409145829741583, + "kl": 0.0128631591796875, + "learning_rate": 1.9576883743659958e-05, + "loss": 0.0005, + "num_tokens": 362307010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18400614491764103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002242553832093719, + "kl": 0.0122528076171875, + "learning_rate": 1.957516732196602e-05, + "loss": 0.0005, + "num_tokens": 362872402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18417683707433644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010567644878871285, + "kl": 0.0122528076171875, + "learning_rate": 1.9573447501433327e-05, + "loss": 0.0005, + "num_tokens": 363433970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18434752923103184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027547376093876917, + "kl": 0.0118560791015625, + "learning_rate": 1.957172428267235e-05, + "loss": 0.0005, + "num_tokens": 363999586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18451822138772722, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.038559675766106e-05, + "kl": 0.0123443603515625, + "learning_rate": 1.9569997666294774e-05, + "loss": 0.0005, + "num_tokens": 364569250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18468891354442263, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.035681362656195e-05, + "kl": 0.01251220703125, + "learning_rate": 1.9568267652913484e-05, + "loss": 0.0005, + "num_tokens": 365134258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18485960570111803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000584515110857474, + "kl": 0.0125579833984375, + "learning_rate": 1.956653424314257e-05, + "loss": 0.0005, + "num_tokens": 365701346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18503029785781344, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.104168645112458e-05, + "kl": 0.0124359130859375, + "learning_rate": 1.9564797437597334e-05, + "loss": 0.0005, + "num_tokens": 366271538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18520099001450882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014780646103568017, + "kl": 0.012237548828125, + "learning_rate": 1.9563057236894276e-05, + "loss": 0.0005, + "num_tokens": 366835586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18537168217120423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008407633542080585, + "kl": 0.013214111328125, + "learning_rate": 1.9561313641651104e-05, + "loss": 0.0005, + "num_tokens": 367407314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18554237432789963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004278212692051787, + "kl": 0.012725830078125, + "learning_rate": 1.9559566652486734e-05, + "loss": 0.0005, + "num_tokens": 367969234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18571306648459504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022258834432312447, + "kl": 0.01220703125, + "learning_rate": 1.9557816270021284e-05, + "loss": 0.0005, + "num_tokens": 368533906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18588375864129042, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.85233896538427e-05, + "kl": 0.0121307373046875, + "learning_rate": 1.9556062494876072e-05, + "loss": 0.0005, + "num_tokens": 369104034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18605445079798583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007665861014403946, + "kl": 0.0128021240234375, + "learning_rate": 1.9554305327673632e-05, + "loss": 0.0005, + "num_tokens": 369670546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18622514295468123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008983045152701206, + "kl": 0.01348876953125, + "learning_rate": 1.9552544769037693e-05, + "loss": 0.0005, + "num_tokens": 370231922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18639583511137664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008098691924084937, + "kl": 0.0123138427734375, + "learning_rate": 1.9550780819593184e-05, + "loss": 0.0005, + "num_tokens": 370802866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18656652726807202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019928979440244723, + "kl": 0.012451171875, + "learning_rate": 1.9549013479966247e-05, + "loss": 0.0005, + "num_tokens": 371364258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18673721942476743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048243222881718794, + "kl": 0.012542724609375, + "learning_rate": 1.9547242750784224e-05, + "loss": 0.0005, + "num_tokens": 371934370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18690791158146283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009051884279741421, + "kl": 0.01214599609375, + "learning_rate": 1.9545468632675665e-05, + "loss": 0.0005, + "num_tokens": 372506754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18707860373815824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007333542954201964, + "kl": 0.012359619140625, + "learning_rate": 1.9543691126270308e-05, + "loss": 0.0005, + "num_tokens": 373074002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18724929589485362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009423875492403731, + "kl": 0.01239013671875, + "learning_rate": 1.954191023219911e-05, + "loss": 0.0005, + "num_tokens": 373634962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18741998805154902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005188719600817539, + "kl": 0.0128021240234375, + "learning_rate": 1.9540125951094226e-05, + "loss": 0.0005, + "num_tokens": 374196914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18759068020824443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012455081780588463, + "kl": 0.01214599609375, + "learning_rate": 1.9538338283589005e-05, + "loss": 0.0005, + "num_tokens": 374761890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18776137236493984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011831030859754251, + "kl": 0.0126800537109375, + "learning_rate": 1.9536547230318006e-05, + "loss": 0.0005, + "num_tokens": 375325618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18793206452163522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000732177995191362, + "kl": 0.0127410888671875, + "learning_rate": 1.9534752791916995e-05, + "loss": 0.0005, + "num_tokens": 375893938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18810275667833062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013438303937898142, + "kl": 0.0121917724609375, + "learning_rate": 1.9532954969022922e-05, + "loss": 0.0005, + "num_tokens": 376460562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18827344883502603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016439009884379311, + "kl": 0.01300048828125, + "learning_rate": 1.9531153762273957e-05, + "loss": 0.0005, + "num_tokens": 377026722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18844414099172144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008359843365157476, + "kl": 0.012481689453125, + "learning_rate": 1.9529349172309463e-05, + "loss": 0.0005, + "num_tokens": 377584450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18861483314841684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009749683671024393, + "kl": 0.012725830078125, + "learning_rate": 1.952754119977e-05, + "loss": 0.0005, + "num_tokens": 378151010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18878552530511222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001262152394966673, + "kl": 0.0123138427734375, + "learning_rate": 1.9525729845297337e-05, + "loss": 0.0005, + "num_tokens": 378714754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18895621746180763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001042546887794352, + "kl": 0.0124359130859375, + "learning_rate": 1.9523915109534437e-05, + "loss": 0.0005, + "num_tokens": 379283314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18912690961850304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007644953469904003, + "kl": 0.0126495361328125, + "learning_rate": 1.952209699312547e-05, + "loss": 0.0005, + "num_tokens": 379848386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18929760177519844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011926403641431393, + "kl": 0.0128021240234375, + "learning_rate": 1.9520275496715796e-05, + "loss": 0.0005, + "num_tokens": 380413410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18946829393189382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009009437705604803, + "kl": 0.01251220703125, + "learning_rate": 1.9518450620951982e-05, + "loss": 0.0005, + "num_tokens": 380985490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18963898608858923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006080587858819023, + "kl": 0.0131683349609375, + "learning_rate": 1.9516622366481794e-05, + "loss": 0.0005, + "num_tokens": 381549282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18980967824528464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005535324578343898, + "kl": 0.011932373046875, + "learning_rate": 1.9514790733954195e-05, + "loss": 0.0005, + "num_tokens": 382118978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18998037040198004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012963761547062772, + "kl": 0.0121307373046875, + "learning_rate": 1.9512955724019353e-05, + "loss": 0.0005, + "num_tokens": 382678834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19015106255867542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008046964610194925, + "kl": 0.0124664306640625, + "learning_rate": 1.9511117337328625e-05, + "loss": 0.0005, + "num_tokens": 383247522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19032175471537083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004293399294946299, + "kl": 0.012298583984375, + "learning_rate": 1.9509275574534575e-05, + "loss": 0.0005, + "num_tokens": 383813682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19049244687206623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003890598069567184, + "kl": 0.0124969482421875, + "learning_rate": 1.950743043629096e-05, + "loss": 0.0005, + "num_tokens": 384378434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19066313902876164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009047078950641097, + "kl": 0.012481689453125, + "learning_rate": 1.9505581923252743e-05, + "loss": 0.0005, + "num_tokens": 384944914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19083383118545702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011237101061628912, + "kl": 0.0125274658203125, + "learning_rate": 1.9503730036076068e-05, + "loss": 0.0005, + "num_tokens": 385507074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19100452334215243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008343162424226475, + "kl": 0.0130615234375, + "learning_rate": 1.95018747754183e-05, + "loss": 0.0005, + "num_tokens": 386069522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19117521549884783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006911436758421927, + "kl": 0.0128173828125, + "learning_rate": 1.9500016141937982e-05, + "loss": 0.0005, + "num_tokens": 386634706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19134590765554324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025080232296794715, + "kl": 0.012603759765625, + "learning_rate": 1.9498154136294865e-05, + "loss": 0.0005, + "num_tokens": 387199266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19151659981223862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014660950859409716, + "kl": 0.013763427734375, + "learning_rate": 1.9496288759149896e-05, + "loss": 0.0005, + "num_tokens": 387761970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19168729196893403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008630553443319344, + "kl": 0.0130767822265625, + "learning_rate": 1.949442001116521e-05, + "loss": 0.0005, + "num_tokens": 388331010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19185798412562943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007061729120833661, + "kl": 0.0131683349609375, + "learning_rate": 1.949254789300415e-05, + "loss": 0.0005, + "num_tokens": 388897906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19202867628232484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005673565744833615, + "kl": 0.0128173828125, + "learning_rate": 1.9490672405331252e-05, + "loss": 0.0005, + "num_tokens": 389462658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19219936843902022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001573096850374748, + "kl": 0.0122528076171875, + "learning_rate": 1.9488793548812243e-05, + "loss": 0.0005, + "num_tokens": 390027618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19237006059571563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010043408626182033, + "kl": 0.01202392578125, + "learning_rate": 1.948691132411405e-05, + "loss": 0.0005, + "num_tokens": 390598978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19254075275241103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029934583869265077, + "kl": 0.01190185546875, + "learning_rate": 1.948502573190479e-05, + "loss": 0.0005, + "num_tokens": 391168978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19271144490910644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047993023089599806, + "kl": 0.0127410888671875, + "learning_rate": 1.9483136772853788e-05, + "loss": 0.0005, + "num_tokens": 391734850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19288213706580182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000619513825019532, + "kl": 0.012481689453125, + "learning_rate": 1.9481244447631552e-05, + "loss": 0.0005, + "num_tokens": 392297826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19305282922249722, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005928521386842787, + "kl": 0.0125579833984375, + "learning_rate": 1.9479348756909797e-05, + "loss": 0.0005, + "num_tokens": 392864498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19322352137919263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002580451030445199, + "kl": 0.013763427734375, + "learning_rate": 1.9477449701361412e-05, + "loss": 0.0006, + "num_tokens": 393434370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19339421353588804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007336312130898098, + "kl": 0.0122833251953125, + "learning_rate": 1.94755472816605e-05, + "loss": 0.0005, + "num_tokens": 394005330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19356490569258342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009536638699563089, + "kl": 0.0122833251953125, + "learning_rate": 1.947364149848235e-05, + "loss": 0.0005, + "num_tokens": 394573810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19373559784927882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008614823356809954, + "kl": 0.0122528076171875, + "learning_rate": 1.9471732352503447e-05, + "loss": 0.0005, + "num_tokens": 395139090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19390629000597423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007045699162382339, + "kl": 0.01226806640625, + "learning_rate": 1.946981984440147e-05, + "loss": 0.0005, + "num_tokens": 395701314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19407698216266964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008301165088386747, + "kl": 0.012939453125, + "learning_rate": 1.946790397485529e-05, + "loss": 0.0005, + "num_tokens": 396264226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19424767431936502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013590912064222863, + "kl": 0.01300048828125, + "learning_rate": 1.946598474454497e-05, + "loss": 0.0005, + "num_tokens": 396828434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19441836647606042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005780706304241161, + "kl": 0.0124053955078125, + "learning_rate": 1.9464062154151765e-05, + "loss": 0.0005, + "num_tokens": 397393218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19458905863275583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012464447764344924, + "kl": 0.0127105712890625, + "learning_rate": 1.9462136204358133e-05, + "loss": 0.0005, + "num_tokens": 397955938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19475975078945124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005125655134175882, + "kl": 0.0127716064453125, + "learning_rate": 1.9460206895847707e-05, + "loss": 0.0005, + "num_tokens": 398527394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19493044294614661, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011891140617647767, + "kl": 0.0126190185546875, + "learning_rate": 1.9458274229305333e-05, + "loss": 0.0005, + "num_tokens": 399094706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19510113510284202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008841018917492982, + "kl": 0.0123138427734375, + "learning_rate": 1.9456338205417026e-05, + "loss": 0.0005, + "num_tokens": 399656930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19527182725953743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008504246569356632, + "kl": 0.0128021240234375, + "learning_rate": 1.9454398824870014e-05, + "loss": 0.0005, + "num_tokens": 400225522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19544251941623283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010577557694982289, + "kl": 0.012298583984375, + "learning_rate": 1.94524560883527e-05, + "loss": 0.0005, + "num_tokens": 400786450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19561321157292821, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006882084788763286, + "kl": 0.012542724609375, + "learning_rate": 1.945050999655469e-05, + "loss": 0.0005, + "num_tokens": 401356258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19578390372962362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010924973047272421, + "kl": 0.0131378173828125, + "learning_rate": 1.9448560550166777e-05, + "loss": 0.0005, + "num_tokens": 401920354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19595459588631903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038400715271838426, + "kl": 0.0128021240234375, + "learning_rate": 1.9446607749880942e-05, + "loss": 0.0005, + "num_tokens": 402487282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19612528804301443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009616006069271627, + "kl": 0.0121917724609375, + "learning_rate": 1.9444651596390355e-05, + "loss": 0.0005, + "num_tokens": 403049954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1962959801997098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001074482234868787, + "kl": 0.0125732421875, + "learning_rate": 1.9442692090389387e-05, + "loss": 0.0005, + "num_tokens": 403615570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19646667235640522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006491487637768793, + "kl": 0.012542724609375, + "learning_rate": 1.9440729232573588e-05, + "loss": 0.0005, + "num_tokens": 404185586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19663736451310063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022502829770315038, + "kl": 0.0129241943359375, + "learning_rate": 1.9438763023639703e-05, + "loss": 0.0005, + "num_tokens": 404754738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19680805666979603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006342163025989813, + "kl": 0.0123748779296875, + "learning_rate": 1.9436793464285664e-05, + "loss": 0.0005, + "num_tokens": 405319346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1969787488264914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007072217759633748, + "kl": 0.0123291015625, + "learning_rate": 1.9434820555210592e-05, + "loss": 0.0005, + "num_tokens": 405888738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19714944098318682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005581314159333476, + "kl": 0.012481689453125, + "learning_rate": 1.9432844297114802e-05, + "loss": 0.0005, + "num_tokens": 406453586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19732013313988223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007854048348202293, + "kl": 0.0127105712890625, + "learning_rate": 1.9430864690699792e-05, + "loss": 0.0005, + "num_tokens": 407019378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19749082529657763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004929799596842098, + "kl": 0.01263427734375, + "learning_rate": 1.942888173666825e-05, + "loss": 0.0005, + "num_tokens": 407584850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.197661517453273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006302124307969836, + "kl": 0.01190185546875, + "learning_rate": 1.9426895435724056e-05, + "loss": 0.0005, + "num_tokens": 408152946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19783220960996842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000831715503054808, + "kl": 0.0121612548828125, + "learning_rate": 1.9424905788572276e-05, + "loss": 0.0005, + "num_tokens": 408718482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19800290176666382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007463009526597421, + "kl": 0.0122528076171875, + "learning_rate": 1.9422912795919157e-05, + "loss": 0.0005, + "num_tokens": 409286130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19817359392335923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007857685190069912, + "kl": 0.0124969482421875, + "learning_rate": 1.9420916458472144e-05, + "loss": 0.0005, + "num_tokens": 409857810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1983442860800546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007921738377274678, + "kl": 0.0128936767578125, + "learning_rate": 1.9418916776939865e-05, + "loss": 0.0005, + "num_tokens": 410422530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19851497823675002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046733807565109306, + "kl": 0.0124969482421875, + "learning_rate": 1.941691375203213e-05, + "loss": 0.0005, + "num_tokens": 410992914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19868567039344542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004555211889545834, + "kl": 0.013092041015625, + "learning_rate": 1.9414907384459952e-05, + "loss": 0.0005, + "num_tokens": 411562802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19885636255014083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028248358309246476, + "kl": 0.013580322265625, + "learning_rate": 1.9412897674935504e-05, + "loss": 0.0005, + "num_tokens": 412126178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1990270547068362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000425771142152826, + "kl": 0.012451171875, + "learning_rate": 1.941088462417217e-05, + "loss": 0.0005, + "num_tokens": 412688402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19919774686353162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034879973387251056, + "kl": 0.012481689453125, + "learning_rate": 1.9408868232884508e-05, + "loss": 0.0005, + "num_tokens": 413254850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19936843902022702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008007635440739338, + "kl": 0.0140838623046875, + "learning_rate": 1.9406848501788264e-05, + "loss": 0.0006, + "num_tokens": 413824562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19953913117692243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008627673275867901, + "kl": 0.012359619140625, + "learning_rate": 1.940482543160037e-05, + "loss": 0.0005, + "num_tokens": 414386258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1997098233336178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000795053652476658, + "kl": 0.0128173828125, + "learning_rate": 1.9402799023038938e-05, + "loss": 0.0005, + "num_tokens": 414948082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.19988051549031322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006602044763393071, + "kl": 0.0125274658203125, + "learning_rate": 1.940076927682328e-05, + "loss": 0.0005, + "num_tokens": 415516818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20005120764700862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008527644559280373, + "kl": 0.0128021240234375, + "learning_rate": 1.9398736193673873e-05, + "loss": 0.0005, + "num_tokens": 416080274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20022189980370403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005734160256289179, + "kl": 0.0128021240234375, + "learning_rate": 1.9396699774312395e-05, + "loss": 0.0005, + "num_tokens": 416649858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2003925919603994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008066867587173404, + "kl": 0.0129852294921875, + "learning_rate": 1.9394660019461696e-05, + "loss": 0.0005, + "num_tokens": 417216914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20056328411709481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006617641350960587, + "kl": 0.0122528076171875, + "learning_rate": 1.939261692984582e-05, + "loss": 0.0005, + "num_tokens": 417777858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20073397627379022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007331104886824906, + "kl": 0.012298583984375, + "learning_rate": 1.939057050618999e-05, + "loss": 0.0005, + "num_tokens": 418346850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20090466843048563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012287385706546085, + "kl": 0.0122222900390625, + "learning_rate": 1.9388520749220605e-05, + "loss": 0.0005, + "num_tokens": 418911714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.201075360587181, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006191438805573302, + "kl": 0.0118865966796875, + "learning_rate": 1.9386467659665266e-05, + "loss": 0.0005, + "num_tokens": 419479106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2012460527438764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044898820960405925, + "kl": 0.0118255615234375, + "learning_rate": 1.9384411238252733e-05, + "loss": 0.0005, + "num_tokens": 420040498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20141674490057182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000340101862268476, + "kl": 0.013153076171875, + "learning_rate": 1.9382351485712973e-05, + "loss": 0.0005, + "num_tokens": 420600786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20158743705726723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008627405600120587, + "kl": 0.0126800537109375, + "learning_rate": 1.9380288402777118e-05, + "loss": 0.0005, + "num_tokens": 421166194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2017581292139626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008329913082864418, + "kl": 0.012664794921875, + "learning_rate": 1.9378221990177488e-05, + "loss": 0.0005, + "num_tokens": 421736658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.201928821370658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003027346852838985, + "kl": 0.012603759765625, + "learning_rate": 1.9376152248647587e-05, + "loss": 0.0005, + "num_tokens": 422299602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20209951352735342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007777686226194874, + "kl": 0.01287841796875, + "learning_rate": 1.9374079178922097e-05, + "loss": 0.0005, + "num_tokens": 422864546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20227020568404883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007736640622452194, + "kl": 0.0120391845703125, + "learning_rate": 1.9372002781736884e-05, + "loss": 0.0005, + "num_tokens": 423431106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2024408978407442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016501090615838286, + "kl": 0.01251220703125, + "learning_rate": 1.9369923057828994e-05, + "loss": 0.0005, + "num_tokens": 424000498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2026115899974396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035414574050094354, + "kl": 0.012786865234375, + "learning_rate": 1.9367840007936652e-05, + "loss": 0.0005, + "num_tokens": 424562498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20278228215413502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007618162472045035, + "kl": 0.0127105712890625, + "learning_rate": 1.936575363279927e-05, + "loss": 0.0005, + "num_tokens": 425125298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20295297431083043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006216113032725799, + "kl": 0.0120697021484375, + "learning_rate": 1.9363663933157425e-05, + "loss": 0.0005, + "num_tokens": 425687458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2031236664675258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006575479211636246, + "kl": 0.012420654296875, + "learning_rate": 1.9361570909752897e-05, + "loss": 0.0005, + "num_tokens": 426254322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2032943586242212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002152816359429612, + "kl": 0.012359619140625, + "learning_rate": 1.9359474563328632e-05, + "loss": 0.0005, + "num_tokens": 426815266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20346505078091662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00564552479090654, + "kl": 0.013031005859375, + "learning_rate": 1.935737489462875e-05, + "loss": 0.0005, + "num_tokens": 427374482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20363574293761202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006324748577331501, + "kl": 0.0121307373046875, + "learning_rate": 1.9355271904398563e-05, + "loss": 0.0005, + "num_tokens": 427936546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2038064350943074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007464829151101112, + "kl": 0.0125579833984375, + "learning_rate": 1.935316559338456e-05, + "loss": 0.0005, + "num_tokens": 428500594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2039771272510028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004984483653607991, + "kl": 0.0124969482421875, + "learning_rate": 1.9351055962334398e-05, + "loss": 0.0005, + "num_tokens": 429070530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20414781940769822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005168182179770298, + "kl": 0.012237548828125, + "learning_rate": 1.9348943011996922e-05, + "loss": 0.0005, + "num_tokens": 429637234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20431851156439362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007198998351068055, + "kl": 0.0130615234375, + "learning_rate": 1.9346826743122158e-05, + "loss": 0.0005, + "num_tokens": 430204210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.204489203721089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002688635377949183, + "kl": 0.0125885009765625, + "learning_rate": 1.9344707156461297e-05, + "loss": 0.0005, + "num_tokens": 430777634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2046598958777844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000742052705385659, + "kl": 0.01220703125, + "learning_rate": 1.9342584252766727e-05, + "loss": 0.0005, + "num_tokens": 431341314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20483058803447982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002195477362359335, + "kl": 0.012237548828125, + "learning_rate": 1.9340458032791987e-05, + "loss": 0.0005, + "num_tokens": 431911794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20500128019117522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000281098364757962, + "kl": 0.012115478515625, + "learning_rate": 1.933832849729182e-05, + "loss": 0.0005, + "num_tokens": 432475490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20517197234787063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012248163665313594, + "kl": 0.0123138427734375, + "learning_rate": 1.933619564702213e-05, + "loss": 0.0005, + "num_tokens": 433042130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.205342664504566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005648096346178597, + "kl": 0.0126190185546875, + "learning_rate": 1.9334059482740004e-05, + "loss": 0.0005, + "num_tokens": 433607874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20551335666126141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000721010248573833, + "kl": 0.0126495361328125, + "learning_rate": 1.93319200052037e-05, + "loss": 0.0005, + "num_tokens": 434175266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20568404881795682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008541982602444027, + "kl": 0.0127105712890625, + "learning_rate": 1.9329777215172657e-05, + "loss": 0.0005, + "num_tokens": 434739586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20585474097465223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005415982601551859, + "kl": 0.0124053955078125, + "learning_rate": 1.9327631113407492e-05, + "loss": 0.0005, + "num_tokens": 435304322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2060254331313476, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.225208025444348e-05, + "kl": 0.011810302734375, + "learning_rate": 1.9325481700669985e-05, + "loss": 0.0005, + "num_tokens": 435871298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20619612528804301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018955023142374506, + "kl": 0.012451171875, + "learning_rate": 1.932332897772311e-05, + "loss": 0.0005, + "num_tokens": 436437490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20636681744473842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009955220095284223, + "kl": 0.0125732421875, + "learning_rate": 1.9321172945330996e-05, + "loss": 0.0005, + "num_tokens": 437000626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20653750960143383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011343228482328554, + "kl": 0.0128326416015625, + "learning_rate": 1.9319013604258964e-05, + "loss": 0.0005, + "num_tokens": 437564818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2067082017581292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005446761258669491, + "kl": 0.012939453125, + "learning_rate": 1.93168509552735e-05, + "loss": 0.0005, + "num_tokens": 438131074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2068788939148246, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.859248692362373e-05, + "kl": 0.012176513671875, + "learning_rate": 1.9314684999142263e-05, + "loss": 0.0005, + "num_tokens": 438696802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20704958607152002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01958445671034562, + "kl": 0.0132293701171875, + "learning_rate": 1.9312515736634094e-05, + "loss": 0.0005, + "num_tokens": 439259442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20722027822821543, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.449038987955456e-05, + "kl": 0.012481689453125, + "learning_rate": 1.9310343168519e-05, + "loss": 0.0005, + "num_tokens": 439825234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2073909703849108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003613473766277323, + "kl": 0.0120086669921875, + "learning_rate": 1.9308167295568165e-05, + "loss": 0.0005, + "num_tokens": 440393250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2075616625416062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042707894689089827, + "kl": 0.0124969482421875, + "learning_rate": 1.930598811855395e-05, + "loss": 0.0005, + "num_tokens": 440959122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20773235469830162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006503404742300058, + "kl": 0.0124664306640625, + "learning_rate": 1.9303805638249873e-05, + "loss": 0.0005, + "num_tokens": 441521922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20790304685499703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008871149663220975, + "kl": 0.0130462646484375, + "learning_rate": 1.9301619855430646e-05, + "loss": 0.0005, + "num_tokens": 442090082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2080737390116924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005775712260743485, + "kl": 0.0125579833984375, + "learning_rate": 1.929943077087214e-05, + "loss": 0.0005, + "num_tokens": 442666674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2082444311683878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010776111389375505, + "kl": 0.012847900390625, + "learning_rate": 1.92972383853514e-05, + "loss": 0.0005, + "num_tokens": 443232386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20841512332508322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01599629708264779, + "kl": 0.0141448974609375, + "learning_rate": 1.9295042699646645e-05, + "loss": 0.0006, + "num_tokens": 443800226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20858581548177862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011498935365223625, + "kl": 0.0124664306640625, + "learning_rate": 1.9292843714537262e-05, + "loss": 0.0005, + "num_tokens": 444365714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.208756507638474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011588661027675438, + "kl": 0.0131988525390625, + "learning_rate": 1.9290641430803813e-05, + "loss": 0.0005, + "num_tokens": 444937986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2089271997951694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000841151531871347, + "kl": 0.0121002197265625, + "learning_rate": 1.928843584922803e-05, + "loss": 0.0005, + "num_tokens": 445504162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20909789195186482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015870795080886266, + "kl": 0.012725830078125, + "learning_rate": 1.9286226970592814e-05, + "loss": 0.0005, + "num_tokens": 446068210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20926858410856022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001744747160004113, + "kl": 0.013031005859375, + "learning_rate": 1.9284014795682234e-05, + "loss": 0.0005, + "num_tokens": 446630098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2094392762652556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013288129999986036, + "kl": 0.013153076171875, + "learning_rate": 1.928179932528154e-05, + "loss": 0.0005, + "num_tokens": 447196562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.209609968421951, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014511613608829367, + "kl": 0.012847900390625, + "learning_rate": 1.9279580560177135e-05, + "loss": 0.0005, + "num_tokens": 447762434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20978066057864642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002204399120060342, + "kl": 0.0130615234375, + "learning_rate": 1.9277358501156606e-05, + "loss": 0.0005, + "num_tokens": 448341698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20995135273534182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005475685270759043, + "kl": 0.013427734375, + "learning_rate": 1.9275133149008703e-05, + "loss": 0.0005, + "num_tokens": 448906018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2101220448920372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00200188275127911, + "kl": 0.012969970703125, + "learning_rate": 1.9272904504523344e-05, + "loss": 0.0005, + "num_tokens": 449475266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2102927370487326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022090938121452825, + "kl": 0.0136260986328125, + "learning_rate": 1.9270672568491623e-05, + "loss": 0.0005, + "num_tokens": 450044034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21046342920542802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001620889377652405, + "kl": 0.0133209228515625, + "learning_rate": 1.9268437341705784e-05, + "loss": 0.0005, + "num_tokens": 450615394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21063412136212342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018886547216249905, + "kl": 0.012969970703125, + "learning_rate": 1.926619882495927e-05, + "loss": 0.0005, + "num_tokens": 451181106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2108048135188188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007266981752641222, + "kl": 0.0130767822265625, + "learning_rate": 1.9263957019046653e-05, + "loss": 0.0005, + "num_tokens": 451745842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2109755056755142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018357733706916596, + "kl": 0.0125579833984375, + "learning_rate": 1.9261711924763713e-05, + "loss": 0.0005, + "num_tokens": 452304434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21114619783220961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018294950035950264, + "kl": 0.0129547119140625, + "learning_rate": 1.9259463542907366e-05, + "loss": 0.0005, + "num_tokens": 452873554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21131688998890502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015994294554768824, + "kl": 0.01263427734375, + "learning_rate": 1.9257211874275708e-05, + "loss": 0.0005, + "num_tokens": 453437010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2114875821456004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014028086691937386, + "kl": 0.0128326416015625, + "learning_rate": 1.9254956919668e-05, + "loss": 0.0005, + "num_tokens": 454002258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2116582743022958, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016381100575732714, + "kl": 0.0129852294921875, + "learning_rate": 1.9252698679884667e-05, + "loss": 0.0005, + "num_tokens": 454568690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2118289664589912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009223256513231815, + "kl": 0.0124053955078125, + "learning_rate": 1.925043715572731e-05, + "loss": 0.0005, + "num_tokens": 455134530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21199965861568662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011220100704766522, + "kl": 0.012908935546875, + "learning_rate": 1.9248172347998685e-05, + "loss": 0.0005, + "num_tokens": 455700770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.212170350772382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002762935708531682, + "kl": 0.012908935546875, + "learning_rate": 1.9245904257502714e-05, + "loss": 0.0005, + "num_tokens": 456267378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2123410429290774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009215620792490865, + "kl": 0.01348876953125, + "learning_rate": 1.9243632885044493e-05, + "loss": 0.0005, + "num_tokens": 456833122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2125117350857728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007721414266476123, + "kl": 0.012908935546875, + "learning_rate": 1.924135823143027e-05, + "loss": 0.0005, + "num_tokens": 457395586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21268242724246822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001086355578278036, + "kl": 0.012481689453125, + "learning_rate": 1.923908029746747e-05, + "loss": 0.0005, + "num_tokens": 457961090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2128531193991636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007607486919487609, + "kl": 0.0127105712890625, + "learning_rate": 1.9236799083964678e-05, + "loss": 0.0005, + "num_tokens": 458552114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.213023811555859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007980301072961263, + "kl": 0.0125885009765625, + "learning_rate": 1.9234514591731635e-05, + "loss": 0.0005, + "num_tokens": 459118946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2131945037125544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002577906151190833, + "kl": 0.0124359130859375, + "learning_rate": 1.923222682157926e-05, + "loss": 0.0005, + "num_tokens": 459682562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21336519586924982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007677603782791912, + "kl": 0.012603759765625, + "learning_rate": 1.9229935774319634e-05, + "loss": 0.0005, + "num_tokens": 460256818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2135358880259452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000387201957639892, + "kl": 0.0132598876953125, + "learning_rate": 1.9227641450765985e-05, + "loss": 0.0005, + "num_tokens": 460822466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2137065801826406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011517990230591449, + "kl": 0.0127410888671875, + "learning_rate": 1.9225343851732724e-05, + "loss": 0.0005, + "num_tokens": 461391218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.213877272339336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006617437129666936, + "kl": 0.0125885009765625, + "learning_rate": 1.9223042978035405e-05, + "loss": 0.0005, + "num_tokens": 461959874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21404796449603142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000312265218149261, + "kl": 0.0129852294921875, + "learning_rate": 1.9220738830490764e-05, + "loss": 0.0005, + "num_tokens": 462523986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2142186566527268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002611635336590485, + "kl": 0.01275634765625, + "learning_rate": 1.921843140991669e-05, + "loss": 0.0005, + "num_tokens": 463093058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2143893488094222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001632262055265947, + "kl": 0.0125732421875, + "learning_rate": 1.9216120717132227e-05, + "loss": 0.0005, + "num_tokens": 463657810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2145600409661176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003330501443045821, + "kl": 0.0126800537109375, + "learning_rate": 1.9213806752957594e-05, + "loss": 0.0005, + "num_tokens": 464224690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21473073312281302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000443301555811143, + "kl": 0.0130157470703125, + "learning_rate": 1.9211489518214165e-05, + "loss": 0.0005, + "num_tokens": 464795106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2149014252795084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003583068334688715, + "kl": 0.0117950439453125, + "learning_rate": 1.9209169013724472e-05, + "loss": 0.0005, + "num_tokens": 465357378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2150721174362038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033553524268116437, + "kl": 0.0123443603515625, + "learning_rate": 1.9206845240312212e-05, + "loss": 0.0005, + "num_tokens": 465922258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2152428095928992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045005340383265087, + "kl": 0.0124664306640625, + "learning_rate": 1.920451819880224e-05, + "loss": 0.0005, + "num_tokens": 466485234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21541350174959462, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006499667491999548, + "kl": 0.0125885009765625, + "learning_rate": 1.920218789002057e-05, + "loss": 0.0005, + "num_tokens": 467050882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21558419390629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011323523320309108, + "kl": 0.011993408203125, + "learning_rate": 1.9199854314794377e-05, + "loss": 0.0005, + "num_tokens": 467622706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2157548860629854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005960119309575832, + "kl": 0.0123138427734375, + "learning_rate": 1.9197517473952e-05, + "loss": 0.0005, + "num_tokens": 468182290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2159255782196808, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044798082623444843, + "kl": 0.012115478515625, + "learning_rate": 1.9195177368322932e-05, + "loss": 0.0005, + "num_tokens": 468743938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21609627037637621, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004734212043220969, + "kl": 0.0125579833984375, + "learning_rate": 1.9192833998737823e-05, + "loss": 0.0005, + "num_tokens": 469310002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2162669625330716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005170930393614699, + "kl": 0.0123748779296875, + "learning_rate": 1.9190487366028486e-05, + "loss": 0.0005, + "num_tokens": 469877906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.216437654689767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011419920826373408, + "kl": 0.0120697021484375, + "learning_rate": 1.9188137471027898e-05, + "loss": 0.0005, + "num_tokens": 470437058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2166083468464624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004778779564055271, + "kl": 0.0119171142578125, + "learning_rate": 1.9185784314570172e-05, + "loss": 0.0005, + "num_tokens": 471003938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21677903900315781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006278445862830742, + "kl": 0.0122222900390625, + "learning_rate": 1.918342789749061e-05, + "loss": 0.0005, + "num_tokens": 471570546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2169497311598532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044249065091509464, + "kl": 0.012237548828125, + "learning_rate": 1.918106822062564e-05, + "loss": 0.0005, + "num_tokens": 472140594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2171204233165486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040264191884154454, + "kl": 0.011871337890625, + "learning_rate": 1.917870528481287e-05, + "loss": 0.0005, + "num_tokens": 472700802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.217291115473244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029355384069245413, + "kl": 0.012298583984375, + "learning_rate": 1.917633909089106e-05, + "loss": 0.0005, + "num_tokens": 473271554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2174618076299394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014401384816980275, + "kl": 0.012115478515625, + "learning_rate": 1.9173969639700117e-05, + "loss": 0.0005, + "num_tokens": 473834802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2176324997866348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021107074976383668, + "kl": 0.012054443359375, + "learning_rate": 1.9171596932081117e-05, + "loss": 0.0005, + "num_tokens": 474397202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2178031919433302, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.15433880540768e-05, + "kl": 0.0126190185546875, + "learning_rate": 1.916922096887628e-05, + "loss": 0.0005, + "num_tokens": 474957506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2179738841000256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004827957884378182, + "kl": 0.01263427734375, + "learning_rate": 1.9166841750928985e-05, + "loss": 0.0005, + "num_tokens": 475523762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.218144576256721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015565614687715524, + "kl": 0.0124053955078125, + "learning_rate": 1.9164459279083777e-05, + "loss": 0.0005, + "num_tokens": 476084898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2183152684134164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007265489756023848, + "kl": 0.0125579833984375, + "learning_rate": 1.916207355418634e-05, + "loss": 0.0005, + "num_tokens": 476654434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2184859605701118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008947197439415404, + "kl": 0.0133209228515625, + "learning_rate": 1.9159684577083525e-05, + "loss": 0.0005, + "num_tokens": 477214866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2186566527268072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014202236563502023, + "kl": 0.012969970703125, + "learning_rate": 1.915729234862333e-05, + "loss": 0.0005, + "num_tokens": 477779378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2188273448835026, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.954928031370089e-05, + "kl": 0.012115478515625, + "learning_rate": 1.9154896869654908e-05, + "loss": 0.0005, + "num_tokens": 478343714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.218998037040198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020989648673763443, + "kl": 0.012725830078125, + "learning_rate": 1.9152498141028575e-05, + "loss": 0.0005, + "num_tokens": 478911250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2191687291968934, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.026524503485888e-05, + "kl": 0.012786865234375, + "learning_rate": 1.9150096163595785e-05, + "loss": 0.0005, + "num_tokens": 479475490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2193394213535888, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003274663876829903, + "kl": 0.0124359130859375, + "learning_rate": 1.9147690938209158e-05, + "loss": 0.0005, + "num_tokens": 480044066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2195101135102842, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.17171402148545e-05, + "kl": 0.01190185546875, + "learning_rate": 1.9145282465722457e-05, + "loss": 0.0005, + "num_tokens": 480609426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2196808056669796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004259480938222765, + "kl": 0.0123291015625, + "learning_rate": 1.914287074699061e-05, + "loss": 0.0005, + "num_tokens": 481174642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.219851497823675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023939631909089586, + "kl": 0.01263427734375, + "learning_rate": 1.9140455782869684e-05, + "loss": 0.0005, + "num_tokens": 481737634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2200221899803704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007971365495643656, + "kl": 0.012908935546875, + "learning_rate": 1.913803757421691e-05, + "loss": 0.0005, + "num_tokens": 482305138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2201928821370658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015954775680388476, + "kl": 0.012542724609375, + "learning_rate": 1.9135616121890657e-05, + "loss": 0.0005, + "num_tokens": 482866722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2203635742937612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025107026954351847, + "kl": 0.0120086669921875, + "learning_rate": 1.9133191426750463e-05, + "loss": 0.0005, + "num_tokens": 483439122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2205342664504566, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.892205446199783e-05, + "kl": 0.0124053955078125, + "learning_rate": 1.9130763489657e-05, + "loss": 0.0005, + "num_tokens": 484009778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.220704958607152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003413340523172457, + "kl": 0.012054443359375, + "learning_rate": 1.91283323114721e-05, + "loss": 0.0005, + "num_tokens": 484576434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2208756507638474, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.617494581249921e-05, + "kl": 0.0130615234375, + "learning_rate": 1.9125897893058737e-05, + "loss": 0.0005, + "num_tokens": 485144946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2210463429205428, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00472826110274963, + "kl": 0.012481689453125, + "learning_rate": 1.9123460235281055e-05, + "loss": 0.0005, + "num_tokens": 485708914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2212170350772382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004566306938036918, + "kl": 0.011962890625, + "learning_rate": 1.9121019339004324e-05, + "loss": 0.0005, + "num_tokens": 486271090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2213877272339336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009729913532758745, + "kl": 0.0126190185546875, + "learning_rate": 1.9118575205094983e-05, + "loss": 0.0005, + "num_tokens": 486836642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.221558419390629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012709541109596708, + "kl": 0.0122833251953125, + "learning_rate": 1.91161278344206e-05, + "loss": 0.0005, + "num_tokens": 487398946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2217291115473244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002017473517645642, + "kl": 0.0126495361328125, + "learning_rate": 1.9113677227849908e-05, + "loss": 0.0005, + "num_tokens": 487968066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2218998037040198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03787060723049592, + "kl": 0.0150604248046875, + "learning_rate": 1.911122338625279e-05, + "loss": 0.0006, + "num_tokens": 488533890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2220704958607152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034506393969848634, + "kl": 0.0124359130859375, + "learning_rate": 1.9108766310500265e-05, + "loss": 0.0005, + "num_tokens": 489100498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2222411880174106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005923044245253577, + "kl": 0.011962890625, + "learning_rate": 1.9106306001464507e-05, + "loss": 0.0005, + "num_tokens": 489661298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.222411880174106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007886009472723877, + "kl": 0.0121612548828125, + "learning_rate": 1.9103842460018837e-05, + "loss": 0.0005, + "num_tokens": 490227730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2225825723308014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007824493725816624, + "kl": 0.01239013671875, + "learning_rate": 1.9101375687037722e-05, + "loss": 0.0005, + "num_tokens": 490795842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2227532644874968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009592765774319291, + "kl": 0.0124053955078125, + "learning_rate": 1.909890568339678e-05, + "loss": 0.0005, + "num_tokens": 491363922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2229239566441922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021082167136638532, + "kl": 0.013092041015625, + "learning_rate": 1.9096432449972772e-05, + "loss": 0.0005, + "num_tokens": 491929586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2230946488008876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016949275534065803, + "kl": 0.0128631591796875, + "learning_rate": 1.9093955987643605e-05, + "loss": 0.0005, + "num_tokens": 492498674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.223265340957583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00181497109716214, + "kl": 0.0128173828125, + "learning_rate": 1.909147629728834e-05, + "loss": 0.0005, + "num_tokens": 493065394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2234360331142784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001818886390840428, + "kl": 0.0125579833984375, + "learning_rate": 1.908899337978717e-05, + "loss": 0.0005, + "num_tokens": 493631314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2236067252709738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021098751789824064, + "kl": 0.0128173828125, + "learning_rate": 1.908650723602144e-05, + "loss": 0.0005, + "num_tokens": 494192178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2237774174276692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018362189198493227, + "kl": 0.0132598876953125, + "learning_rate": 1.9084017866873653e-05, + "loss": 0.0005, + "num_tokens": 494756290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2239481095843646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019592733180537896, + "kl": 0.012603759765625, + "learning_rate": 1.9081525273227433e-05, + "loss": 0.0005, + "num_tokens": 495322050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.22411880174106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018819836393633674, + "kl": 0.01287841796875, + "learning_rate": 1.9079029455967567e-05, + "loss": 0.0005, + "num_tokens": 495885426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2242894938977554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001697341955334946, + "kl": 0.0133209228515625, + "learning_rate": 1.9076530415979976e-05, + "loss": 0.0005, + "num_tokens": 496448658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2244601860544508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017232887994558274, + "kl": 0.012176513671875, + "learning_rate": 1.907402815415173e-05, + "loss": 0.0005, + "num_tokens": 497011058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2246308782111462, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001856633808913352, + "kl": 0.0126953125, + "learning_rate": 1.907152267137105e-05, + "loss": 0.0005, + "num_tokens": 497571874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2248015703678416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010672031072744674, + "kl": 0.0130767822265625, + "learning_rate": 1.9069013968527276e-05, + "loss": 0.0005, + "num_tokens": 498138434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.224972262524537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012318867470211216, + "kl": 0.0127410888671875, + "learning_rate": 1.906650204651092e-05, + "loss": 0.0005, + "num_tokens": 498707746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2251429546812324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011863055274880774, + "kl": 0.0122222900390625, + "learning_rate": 1.9063986906213622e-05, + "loss": 0.0005, + "num_tokens": 499271826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2253136468379278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011146447056748718, + "kl": 0.012451171875, + "learning_rate": 1.9061468548528158e-05, + "loss": 0.0005, + "num_tokens": 499835602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2254843389946232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032785493425824106, + "kl": 0.0125732421875, + "learning_rate": 1.9058946974348464e-05, + "loss": 0.0005, + "num_tokens": 500399346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2256550311513186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015688103990367168, + "kl": 0.0124053955078125, + "learning_rate": 1.9056422184569604e-05, + "loss": 0.0005, + "num_tokens": 500967874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.225825723308014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015332177011023937, + "kl": 0.0122222900390625, + "learning_rate": 1.9053894180087784e-05, + "loss": 0.0005, + "num_tokens": 501532290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2259964154647094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001031456679663517, + "kl": 0.012664794921875, + "learning_rate": 1.9051362961800356e-05, + "loss": 0.0005, + "num_tokens": 502099714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2261671076214048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010411111248133092, + "kl": 0.012359619140625, + "learning_rate": 1.9048828530605812e-05, + "loss": 0.0005, + "num_tokens": 502667522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2263377997781002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009942639832225613, + "kl": 0.0127105712890625, + "learning_rate": 1.9046290887403788e-05, + "loss": 0.0005, + "num_tokens": 503238450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2265084919347956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009223761499921097, + "kl": 0.012237548828125, + "learning_rate": 1.9043750033095046e-05, + "loss": 0.0005, + "num_tokens": 503801874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.226679184091491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005121088653121935, + "kl": 0.012939453125, + "learning_rate": 1.9041205968581505e-05, + "loss": 0.0005, + "num_tokens": 504365330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2268498762481864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011149700457048727, + "kl": 0.01226806640625, + "learning_rate": 1.903865869476621e-05, + "loss": 0.0005, + "num_tokens": 504929922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2270205684048818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037645799570098864, + "kl": 0.0122833251953125, + "learning_rate": 1.9036108212553364e-05, + "loss": 0.0005, + "num_tokens": 505500834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2271912605615772, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022060526101811366, + "kl": 0.0121307373046875, + "learning_rate": 1.9033554522848282e-05, + "loss": 0.0005, + "num_tokens": 506069810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2273619527182726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000504843777779536, + "kl": 0.012847900390625, + "learning_rate": 1.9030997626557437e-05, + "loss": 0.0005, + "num_tokens": 506634162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.227532644874968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004974886402518367, + "kl": 0.0123291015625, + "learning_rate": 1.9028437524588433e-05, + "loss": 0.0005, + "num_tokens": 507202082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2277033370316634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005089081047840407, + "kl": 0.012176513671875, + "learning_rate": 1.902587421785002e-05, + "loss": 0.0005, + "num_tokens": 507767186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2278740291883588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010168934711802132, + "kl": 0.0125274658203125, + "learning_rate": 1.902330770725207e-05, + "loss": 0.0005, + "num_tokens": 508331442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.22804472134505419, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039776710812790825, + "kl": 0.012298583984375, + "learning_rate": 1.902073799370561e-05, + "loss": 0.0005, + "num_tokens": 508894674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2282154135017496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011162838732788941, + "kl": 0.012420654296875, + "learning_rate": 1.9018165078122792e-05, + "loss": 0.0005, + "num_tokens": 509458578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.228386105658445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005985101917429414, + "kl": 0.0120697021484375, + "learning_rate": 1.901558896141691e-05, + "loss": 0.0005, + "num_tokens": 510026546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2285567978151404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007413948381393122, + "kl": 0.012115478515625, + "learning_rate": 1.9013009644502386e-05, + "loss": 0.0005, + "num_tokens": 510595842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.22872748997183578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003331746496369367, + "kl": 0.0125885009765625, + "learning_rate": 1.9010427128294792e-05, + "loss": 0.0005, + "num_tokens": 511162610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2288981821285312, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.454696960670658e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.9007841413710827e-05, + "loss": 0.0005, + "num_tokens": 511722562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2290688742852266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006544713499499937, + "kl": 0.012481689453125, + "learning_rate": 1.9005252501668322e-05, + "loss": 0.0005, + "num_tokens": 512284818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.229239566441922, + "frac_reward_zero_std": 1.0, + "grad_norm": 17955.520331935208, + "kl": 355.0, + "learning_rate": 1.9002660393086253e-05, + "loss": 14.2251, + "num_tokens": 512879010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.22941025859861738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031304525156557834, + "kl": 0.0126800537109375, + "learning_rate": 1.900006508888472e-05, + "loss": 0.0005, + "num_tokens": 513441298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2295809507553128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014521030559436921, + "kl": 0.012664794921875, + "learning_rate": 1.899746658998496e-05, + "loss": 0.0005, + "num_tokens": 514012402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2297516429120082, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014902945680755526, + "kl": 0.0130157470703125, + "learning_rate": 1.8994864897309355e-05, + "loss": 0.0005, + "num_tokens": 514574850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2299223350687036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00284912704701966, + "kl": 0.0132293701171875, + "learning_rate": 1.8992260011781403e-05, + "loss": 0.0005, + "num_tokens": 515139282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23009302722539898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003372285004188178, + "kl": 0.0137176513671875, + "learning_rate": 1.8989651934325755e-05, + "loss": 0.0005, + "num_tokens": 515705890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2302637193820944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035094468943209535, + "kl": 0.0139617919921875, + "learning_rate": 1.8987040665868174e-05, + "loss": 0.0006, + "num_tokens": 516268226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2304344115387898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006062951742856982, + "kl": 0.014678955078125, + "learning_rate": 1.8984426207335575e-05, + "loss": 0.0006, + "num_tokens": 516841298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2306051036954852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005618984630804017, + "kl": 0.0139923095703125, + "learning_rate": 1.8981808559655987e-05, + "loss": 0.0006, + "num_tokens": 517408002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23077579585218058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002943659166052578, + "kl": 0.0151214599609375, + "learning_rate": 1.897918772375858e-05, + "loss": 0.0006, + "num_tokens": 517976466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.230946488008876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006702852141455894, + "kl": 0.014129638671875, + "learning_rate": 1.8976563700573672e-05, + "loss": 0.0006, + "num_tokens": 518549250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2311171801655714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007015027257055901, + "kl": 0.0152435302734375, + "learning_rate": 1.8973936491032677e-05, + "loss": 0.0006, + "num_tokens": 519114178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2312878723222668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005287309635785798, + "kl": 0.0152435302734375, + "learning_rate": 1.8971306096068175e-05, + "loss": 0.0006, + "num_tokens": 519676354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23145856447896218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005359972624926689, + "kl": 0.0148773193359375, + "learning_rate": 1.896867251661385e-05, + "loss": 0.0006, + "num_tokens": 520240018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2316292566356576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006044744023184654, + "kl": 0.0154571533203125, + "learning_rate": 1.8966035753604536e-05, + "loss": 0.0006, + "num_tokens": 520807202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.231799948792353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005759503393497269, + "kl": 0.0147552490234375, + "learning_rate": 1.896339580797618e-05, + "loss": 0.0006, + "num_tokens": 521372978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2319706409490484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031456391339227986, + "kl": 0.0144805908203125, + "learning_rate": 1.8960752680665876e-05, + "loss": 0.0006, + "num_tokens": 521940194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23214133310574378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004498446767837416, + "kl": 0.0145721435546875, + "learning_rate": 1.895810637261183e-05, + "loss": 0.0006, + "num_tokens": 522512610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2323120252624392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003339609175501766, + "kl": 0.0139312744140625, + "learning_rate": 1.8955456884753396e-05, + "loss": 0.0006, + "num_tokens": 523076418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2324827174191346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032181703537855167, + "kl": 0.014373779296875, + "learning_rate": 1.8952804218031038e-05, + "loss": 0.0006, + "num_tokens": 523640706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23265340957583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002404221968277674, + "kl": 0.013946533203125, + "learning_rate": 1.8950148373386365e-05, + "loss": 0.0006, + "num_tokens": 524207746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23282410173252538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020381272569912545, + "kl": 0.0138092041015625, + "learning_rate": 1.8947489351762094e-05, + "loss": 0.0006, + "num_tokens": 524770706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23299479388922079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017350627139895124, + "kl": 0.0134124755859375, + "learning_rate": 1.8944827154102096e-05, + "loss": 0.0005, + "num_tokens": 525334626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2331654860459162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008394594499865885, + "kl": 0.01324462890625, + "learning_rate": 1.8942161781351346e-05, + "loss": 0.0005, + "num_tokens": 525909666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2333361782026116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001762626413016968, + "kl": 0.0134124755859375, + "learning_rate": 1.8939493234455953e-05, + "loss": 0.0005, + "num_tokens": 526477794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23350687035930698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014808128901773131, + "kl": 0.0129547119140625, + "learning_rate": 1.893682151436316e-05, + "loss": 0.0005, + "num_tokens": 527038098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23367756251600239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009553943534831295, + "kl": 0.0128021240234375, + "learning_rate": 1.8934146622021336e-05, + "loss": 0.0005, + "num_tokens": 527605426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2338482546726978, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005180082760350259, + "kl": 0.0136566162109375, + "learning_rate": 1.893146855837996e-05, + "loss": 0.0005, + "num_tokens": 528171426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2340189468293932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008652922974548126, + "kl": 0.0127410888671875, + "learning_rate": 1.8928787324389656e-05, + "loss": 0.0005, + "num_tokens": 528737058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23418963898608858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036621037775474855, + "kl": 0.0133209228515625, + "learning_rate": 1.892610292100216e-05, + "loss": 0.0005, + "num_tokens": 529301890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23436033114278398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010042307239386153, + "kl": 0.0128936767578125, + "learning_rate": 1.8923415349170346e-05, + "loss": 0.0005, + "num_tokens": 529870034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2345310232994794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010979384592665767, + "kl": 0.01336669921875, + "learning_rate": 1.8920724609848193e-05, + "loss": 0.0005, + "num_tokens": 530435474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2347017154561748, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026557095360097956, + "kl": 0.0126190185546875, + "learning_rate": 1.8918030703990827e-05, + "loss": 0.0005, + "num_tokens": 531025778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23487240761287018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008416673639421858, + "kl": 0.0122833251953125, + "learning_rate": 1.8915333632554483e-05, + "loss": 0.0005, + "num_tokens": 531593186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23504309976956558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008056424514449787, + "kl": 0.0129852294921875, + "learning_rate": 1.8912633396496522e-05, + "loss": 0.0005, + "num_tokens": 532159298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.235213791926261, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006277048099608083, + "kl": 0.013427734375, + "learning_rate": 1.8909929996775436e-05, + "loss": 0.0005, + "num_tokens": 532727378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2353844840829564, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.072400919569016e-05, + "kl": 0.012298583984375, + "learning_rate": 1.890722343435083e-05, + "loss": 0.0005, + "num_tokens": 533295138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23555517623965178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005774878172699975, + "kl": 0.0129852294921875, + "learning_rate": 1.890451371018344e-05, + "loss": 0.0005, + "num_tokens": 533860882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23572586839634718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026962497628505615, + "kl": 0.01239013671875, + "learning_rate": 1.8901800825235113e-05, + "loss": 0.0005, + "num_tokens": 534425442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2358965605530426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003693461375921285, + "kl": 0.0125732421875, + "learning_rate": 1.889908478046883e-05, + "loss": 0.0005, + "num_tokens": 534991682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.236067252709738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002997051540102523, + "kl": 0.0128173828125, + "learning_rate": 1.889636557684869e-05, + "loss": 0.0005, + "num_tokens": 535560514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23623794486643337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007884629923549184, + "kl": 0.0128936767578125, + "learning_rate": 1.889364321533991e-05, + "loss": 0.0005, + "num_tokens": 536134882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23640863702312878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007487038438457876, + "kl": 0.0131988525390625, + "learning_rate": 1.889091769690883e-05, + "loss": 0.0005, + "num_tokens": 536696178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2365793291798242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006053160099444055, + "kl": 0.011962890625, + "learning_rate": 1.8888189022522914e-05, + "loss": 0.0005, + "num_tokens": 537258770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2367500213365196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024770310308124734, + "kl": 0.0126800537109375, + "learning_rate": 1.888545719315074e-05, + "loss": 0.0005, + "num_tokens": 537825538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23692071349321497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004902527787880316, + "kl": 0.0128173828125, + "learning_rate": 1.8882722209762002e-05, + "loss": 0.0005, + "num_tokens": 538385490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23709140564991038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004743367503080445, + "kl": 0.012786865234375, + "learning_rate": 1.8879984073327536e-05, + "loss": 0.0005, + "num_tokens": 538946754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2372620978066058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005243727890750186, + "kl": 0.012603759765625, + "learning_rate": 1.887724278481927e-05, + "loss": 0.0005, + "num_tokens": 539513298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2374327899633012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002584152763992712, + "kl": 0.01220703125, + "learning_rate": 1.8874498345210265e-05, + "loss": 0.0005, + "num_tokens": 540080354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23760348211999657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003422582247239372, + "kl": 0.012451171875, + "learning_rate": 1.8871750755474697e-05, + "loss": 0.0005, + "num_tokens": 540646946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23777417427669198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045629134823624296, + "kl": 0.0123291015625, + "learning_rate": 1.8869000016587865e-05, + "loss": 0.0005, + "num_tokens": 541211346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2379448664333874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003874307844131046, + "kl": 0.0127105712890625, + "learning_rate": 1.886624612952618e-05, + "loss": 0.0005, + "num_tokens": 541777298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2381155585900828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010145992031051615, + "kl": 0.01275634765625, + "learning_rate": 1.886348909526717e-05, + "loss": 0.0005, + "num_tokens": 542338450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23828625074677817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044978277958621456, + "kl": 0.0116729736328125, + "learning_rate": 1.8860728914789485e-05, + "loss": 0.0005, + "num_tokens": 542905874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23845694290347358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022194292589691958, + "kl": 0.0123138427734375, + "learning_rate": 1.885796558907289e-05, + "loss": 0.0005, + "num_tokens": 543470834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23862763506016899, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015140268170667917, + "kl": 0.0121307373046875, + "learning_rate": 1.8855199119098266e-05, + "loss": 0.0005, + "num_tokens": 544032242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2387983272168644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008849089405576157, + "kl": 0.0132904052734375, + "learning_rate": 1.8852429505847607e-05, + "loss": 0.0005, + "num_tokens": 544594930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23896901937355977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004711567928760644, + "kl": 0.012298583984375, + "learning_rate": 1.884965675030403e-05, + "loss": 0.0005, + "num_tokens": 545157090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23913971153025518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005191594989795891, + "kl": 0.0127716064453125, + "learning_rate": 1.8846880853451762e-05, + "loss": 0.0005, + "num_tokens": 545716978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23931040368695058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005962456246623143, + "kl": 0.0128173828125, + "learning_rate": 1.884410181627614e-05, + "loss": 0.0005, + "num_tokens": 546290322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.239481095843646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006625544796970163, + "kl": 0.012725830078125, + "learning_rate": 1.884131963976363e-05, + "loss": 0.0005, + "num_tokens": 546856450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2396517880003414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00226322905701838, + "kl": 0.0130767822265625, + "learning_rate": 1.88385343249018e-05, + "loss": 0.0005, + "num_tokens": 547427442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23982248015703678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006163173505071652, + "kl": 0.012908935546875, + "learning_rate": 1.883574587267934e-05, + "loss": 0.0005, + "num_tokens": 547989906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23999317231373218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001911214110142673, + "kl": 0.0123138427734375, + "learning_rate": 1.8832954284086046e-05, + "loss": 0.0005, + "num_tokens": 548550386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2401638644704276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030932743089521754, + "kl": 0.0128936767578125, + "learning_rate": 1.8830159560112835e-05, + "loss": 0.0005, + "num_tokens": 549116626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.240334556627123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005409474111172469, + "kl": 0.0125732421875, + "learning_rate": 1.8827361701751725e-05, + "loss": 0.0005, + "num_tokens": 549680274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24050524878381838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006417191613295324, + "kl": 0.012481689453125, + "learning_rate": 1.8824560709995864e-05, + "loss": 0.0005, + "num_tokens": 550242162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24067594094051378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006887303023229787, + "kl": 0.012542724609375, + "learning_rate": 1.88217565858395e-05, + "loss": 0.0005, + "num_tokens": 550811378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2408466330972092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018165785613236547, + "kl": 0.011871337890625, + "learning_rate": 1.881894933027799e-05, + "loss": 0.0005, + "num_tokens": 551379794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2410173252539046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005064815237418897, + "kl": 0.0127105712890625, + "learning_rate": 1.881613894430782e-05, + "loss": 0.0005, + "num_tokens": 551946466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24118801741059998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005057985692011094, + "kl": 0.01275634765625, + "learning_rate": 1.8813325428926565e-05, + "loss": 0.0005, + "num_tokens": 552513554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24135870956729538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005573664324445214, + "kl": 0.012420654296875, + "learning_rate": 1.8810508785132925e-05, + "loss": 0.0005, + "num_tokens": 553083010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2415294017239908, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014885033415161064, + "kl": 0.0122833251953125, + "learning_rate": 1.8807689013926712e-05, + "loss": 0.0005, + "num_tokens": 553651250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2417000938806862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003527167880939245, + "kl": 0.0126190185546875, + "learning_rate": 1.8804866116308835e-05, + "loss": 0.0005, + "num_tokens": 554218402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24187078603738157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005940809370845089, + "kl": 0.0126495361328125, + "learning_rate": 1.8802040093281323e-05, + "loss": 0.0005, + "num_tokens": 554780914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24204147819407698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005196212389607318, + "kl": 0.012603759765625, + "learning_rate": 1.8799210945847318e-05, + "loss": 0.0005, + "num_tokens": 555341842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2422121703507724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002813716916851144, + "kl": 0.01239013671875, + "learning_rate": 1.8796378675011057e-05, + "loss": 0.0005, + "num_tokens": 555905042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2423828625074678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004130137018614212, + "kl": 0.0124359130859375, + "learning_rate": 1.8793543281777897e-05, + "loss": 0.0005, + "num_tokens": 556467234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24255355466416317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002637928162776924, + "kl": 0.0129547119140625, + "learning_rate": 1.8790704767154302e-05, + "loss": 0.0005, + "num_tokens": 557031954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24272424682085858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005022395379117973, + "kl": 0.01220703125, + "learning_rate": 1.878786313214784e-05, + "loss": 0.0005, + "num_tokens": 557598002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.242894938977554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025467070270637825, + "kl": 0.012725830078125, + "learning_rate": 1.878501837776719e-05, + "loss": 0.0005, + "num_tokens": 558163842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2430656311342494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003710278605514647, + "kl": 0.0126953125, + "learning_rate": 1.8782170505022137e-05, + "loss": 0.0005, + "num_tokens": 558727762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24323632329094477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029843767837569376, + "kl": 0.01300048828125, + "learning_rate": 1.8779319514923574e-05, + "loss": 0.0005, + "num_tokens": 559292146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24340701544764018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005906795472490442, + "kl": 0.0121307373046875, + "learning_rate": 1.87764654084835e-05, + "loss": 0.0005, + "num_tokens": 559856434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24357770760433559, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042755734763668047, + "kl": 0.0124664306640625, + "learning_rate": 1.877360818671501e-05, + "loss": 0.0005, + "num_tokens": 560424546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.243748399761031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040787695890373864, + "kl": 0.01220703125, + "learning_rate": 1.877074785063233e-05, + "loss": 0.0005, + "num_tokens": 560987698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24391909191772637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005366511603358326, + "kl": 0.012176513671875, + "learning_rate": 1.8767884401250766e-05, + "loss": 0.0005, + "num_tokens": 561550162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24408978407442178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024253713066801122, + "kl": 0.0125274658203125, + "learning_rate": 1.8765017839586742e-05, + "loss": 0.0005, + "num_tokens": 562112354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24426047623111719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005357456035933348, + "kl": 0.012176513671875, + "learning_rate": 1.8762148166657787e-05, + "loss": 0.0005, + "num_tokens": 562680738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2444311683878126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016694093539997105, + "kl": 0.012115478515625, + "learning_rate": 1.8759275383482525e-05, + "loss": 0.0005, + "num_tokens": 563244626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24460186054450797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005256256697325707, + "kl": 0.012298583984375, + "learning_rate": 1.8756399491080696e-05, + "loss": 0.0005, + "num_tokens": 563806578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24477255270120338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026098123911526565, + "kl": 0.0125885009765625, + "learning_rate": 1.8753520490473134e-05, + "loss": 0.0005, + "num_tokens": 564372674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24494324485789878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002464879089956361, + "kl": 0.0126495361328125, + "learning_rate": 1.875063838268178e-05, + "loss": 0.0005, + "num_tokens": 564936738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2451139370145942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023711761653445463, + "kl": 0.0123443603515625, + "learning_rate": 1.8747753168729687e-05, + "loss": 0.0005, + "num_tokens": 565501650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24528462917128957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003837007771247971, + "kl": 0.0121612548828125, + "learning_rate": 1.8744864849640988e-05, + "loss": 0.0005, + "num_tokens": 566068386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24545532132798498, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000265759927018069, + "kl": 0.01275634765625, + "learning_rate": 1.8741973426440944e-05, + "loss": 0.0005, + "num_tokens": 566632546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24562601348468038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006083181297313115, + "kl": 0.012451171875, + "learning_rate": 1.8739078900155894e-05, + "loss": 0.0005, + "num_tokens": 567198754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2457967056413758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006179371028258357, + "kl": 0.0124053955078125, + "learning_rate": 1.87361812718133e-05, + "loss": 0.0005, + "num_tokens": 567756706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24596739779807117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004570361190303942, + "kl": 0.0124359130859375, + "learning_rate": 1.8733280542441713e-05, + "loss": 0.0005, + "num_tokens": 568322658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24613808995476658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003530424215035646, + "kl": 0.0125732421875, + "learning_rate": 1.873037671307079e-05, + "loss": 0.0005, + "num_tokens": 568881586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24630878211146198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009576346650318183, + "kl": 0.012298583984375, + "learning_rate": 1.8727469784731277e-05, + "loss": 0.0005, + "num_tokens": 569447218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2464794742681574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038113237699577966, + "kl": 0.0124664306640625, + "learning_rate": 1.8724559758455037e-05, + "loss": 0.0005, + "num_tokens": 570002786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24665016642485277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040435683230507445, + "kl": 0.012298583984375, + "learning_rate": 1.8721646635275022e-05, + "loss": 0.0005, + "num_tokens": 570567250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24682085858154817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031102371980049396, + "kl": 0.0125579833984375, + "learning_rate": 1.871873041622528e-05, + "loss": 0.0005, + "num_tokens": 571130738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24699155073824358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023316327514504256, + "kl": 0.0128631591796875, + "learning_rate": 1.8715811102340975e-05, + "loss": 0.0005, + "num_tokens": 571702354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.247162242894939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006040838858952705, + "kl": 0.01312255859375, + "learning_rate": 1.871288869465835e-05, + "loss": 0.0005, + "num_tokens": 572266370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24733293505163437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003242256005415194, + "kl": 0.012481689453125, + "learning_rate": 1.8709963194214752e-05, + "loss": 0.0005, + "num_tokens": 572826530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24750362720832977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004571008943347845, + "kl": 0.0125274658203125, + "learning_rate": 1.8707034602048636e-05, + "loss": 0.0005, + "num_tokens": 573396594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24767431936502518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002668528601590774, + "kl": 0.0123291015625, + "learning_rate": 1.870410291919954e-05, + "loss": 0.0005, + "num_tokens": 573969634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2478450115217206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008762629580943129, + "kl": 0.012664794921875, + "learning_rate": 1.8701168146708104e-05, + "loss": 0.0005, + "num_tokens": 574533106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24801570367841597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020041076715520938, + "kl": 0.01275634765625, + "learning_rate": 1.8698230285616073e-05, + "loss": 0.0005, + "num_tokens": 575118226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24818639583511137, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.548532262272918e-05, + "kl": 0.0121002197265625, + "learning_rate": 1.869528933696628e-05, + "loss": 0.0005, + "num_tokens": 575682178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24835708799180678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023025525108814123, + "kl": 0.012481689453125, + "learning_rate": 1.8692345301802654e-05, + "loss": 0.0005, + "num_tokens": 576244498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2485277801485022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003624263135919732, + "kl": 0.01263427734375, + "learning_rate": 1.868939818117022e-05, + "loss": 0.0005, + "num_tokens": 576813554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24869847230519757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027065412286174964, + "kl": 0.0121307373046875, + "learning_rate": 1.8686447976115103e-05, + "loss": 0.0005, + "num_tokens": 577381778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24886916446189297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001036126171509453, + "kl": 0.0124053955078125, + "learning_rate": 1.8683494687684517e-05, + "loss": 0.0005, + "num_tokens": 577943282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24903985661858838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006351403995590996, + "kl": 0.012664794921875, + "learning_rate": 1.868053831692677e-05, + "loss": 0.0005, + "num_tokens": 578506162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24921054877528379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004082705119408757, + "kl": 0.01275634765625, + "learning_rate": 1.8677578864891274e-05, + "loss": 0.0005, + "num_tokens": 579069474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24938124093197916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007522261795193978, + "kl": 0.011871337890625, + "learning_rate": 1.867461633262852e-05, + "loss": 0.0005, + "num_tokens": 579642770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24955193308867457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026624817046526914, + "kl": 0.0118560791015625, + "learning_rate": 1.8671650721190105e-05, + "loss": 0.0005, + "num_tokens": 580202418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24972262524536998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034900475686465746, + "kl": 0.0128326416015625, + "learning_rate": 1.8668682031628712e-05, + "loss": 0.0005, + "num_tokens": 580765234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24989331740206538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007553157775042211, + "kl": 0.0123748779296875, + "learning_rate": 1.8665710264998124e-05, + "loss": 0.0005, + "num_tokens": 581328722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2500640095587608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004902183150274902, + "kl": 0.0128326416015625, + "learning_rate": 1.86627354223532e-05, + "loss": 0.0005, + "num_tokens": 581897378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2502347017154562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006705269663149097, + "kl": 0.012542724609375, + "learning_rate": 1.8659757504749908e-05, + "loss": 0.0005, + "num_tokens": 582468274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25040539387215155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036846343928445465, + "kl": 0.0122222900390625, + "learning_rate": 1.8656776513245308e-05, + "loss": 0.0005, + "num_tokens": 583037250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25057608602884696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000738000759449641, + "kl": 0.01275634765625, + "learning_rate": 1.865379244889753e-05, + "loss": 0.0005, + "num_tokens": 583604450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25074677818554236, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.23564098975356e-05, + "kl": 0.012603759765625, + "learning_rate": 1.865080531276582e-05, + "loss": 0.0005, + "num_tokens": 584168034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25091747034223777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005279203511590148, + "kl": 0.0121002197265625, + "learning_rate": 1.8647815105910498e-05, + "loss": 0.0005, + "num_tokens": 584736770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2510881624989332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029186627599649687, + "kl": 0.0125732421875, + "learning_rate": 1.8644821829392984e-05, + "loss": 0.0005, + "num_tokens": 585306242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2512588546556286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036081524544044586, + "kl": 0.0121002197265625, + "learning_rate": 1.864182548427578e-05, + "loss": 0.0005, + "num_tokens": 585869106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.251429546812324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005656717344619329, + "kl": 0.0127716064453125, + "learning_rate": 1.863882607162248e-05, + "loss": 0.0005, + "num_tokens": 586431138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2516002389690194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031215099153150363, + "kl": 0.0124969482421875, + "learning_rate": 1.863582359249777e-05, + "loss": 0.0005, + "num_tokens": 586993362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25177093112571475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013364568598783468, + "kl": 0.012054443359375, + "learning_rate": 1.863281804796742e-05, + "loss": 0.0005, + "num_tokens": 587553666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25194162328241015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040099542176827684, + "kl": 0.012969970703125, + "learning_rate": 1.862980943909829e-05, + "loss": 0.0005, + "num_tokens": 588120498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25211231543910556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021449355659876218, + "kl": 0.013092041015625, + "learning_rate": 1.8626797766958323e-05, + "loss": 0.0005, + "num_tokens": 588688050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25228300759580097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022553118006549892, + "kl": 0.0124664306640625, + "learning_rate": 1.8623783032616562e-05, + "loss": 0.0005, + "num_tokens": 589249746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2524536997524964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046195064727583746, + "kl": 0.0128631591796875, + "learning_rate": 1.8620765237143127e-05, + "loss": 0.0005, + "num_tokens": 589818210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2526243919091918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004945158691714441, + "kl": 0.011993408203125, + "learning_rate": 1.861774438160922e-05, + "loss": 0.0005, + "num_tokens": 590381410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2527950840658872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000688695494209296, + "kl": 0.012908935546875, + "learning_rate": 1.861472046708714e-05, + "loss": 0.0005, + "num_tokens": 590944914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2529657762225826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003236024897133815, + "kl": 0.012786865234375, + "learning_rate": 1.861169349465027e-05, + "loss": 0.0005, + "num_tokens": 591510994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25313646837927795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034345052562669465, + "kl": 0.01220703125, + "learning_rate": 1.8608663465373077e-05, + "loss": 0.0005, + "num_tokens": 592074482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25330716053597335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005643476297106093, + "kl": 0.0123291015625, + "learning_rate": 1.8605630380331104e-05, + "loss": 0.0005, + "num_tokens": 592650082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25347785269266876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036129545476775305, + "kl": 0.01263427734375, + "learning_rate": 1.8602594240600993e-05, + "loss": 0.0005, + "num_tokens": 593213746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25364854484936417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005353834711542697, + "kl": 0.01202392578125, + "learning_rate": 1.859955504726046e-05, + "loss": 0.0005, + "num_tokens": 593782930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2538192370060596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005511606153366306, + "kl": 0.012420654296875, + "learning_rate": 1.8596512801388317e-05, + "loss": 0.0005, + "num_tokens": 594347506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.253989929162755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022296297173385863, + "kl": 0.0123291015625, + "learning_rate": 1.8593467504064442e-05, + "loss": 0.0005, + "num_tokens": 594910130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2541606213194504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005284169601466906, + "kl": 0.0123291015625, + "learning_rate": 1.859041915636981e-05, + "loss": 0.0005, + "num_tokens": 595477506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2543313134761458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000377066041570461, + "kl": 0.012603759765625, + "learning_rate": 1.858736775938647e-05, + "loss": 0.0005, + "num_tokens": 596039538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25450200563284114, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.470172883810654e-05, + "kl": 0.01287841796875, + "learning_rate": 1.8584313314197568e-05, + "loss": 0.0005, + "num_tokens": 596605922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25467269778953655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003497075853729192, + "kl": 0.0125885009765625, + "learning_rate": 1.8581255821887312e-05, + "loss": 0.0005, + "num_tokens": 597171186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25484338994623196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024343877652106092, + "kl": 0.0134735107421875, + "learning_rate": 1.8578195283541004e-05, + "loss": 0.0005, + "num_tokens": 597733778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25501408210292736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004776567714179982, + "kl": 0.012908935546875, + "learning_rate": 1.8575131700245027e-05, + "loss": 0.0005, + "num_tokens": 598307122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25518477425962277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033155049314513064, + "kl": 0.0121917724609375, + "learning_rate": 1.8572065073086843e-05, + "loss": 0.0005, + "num_tokens": 598869874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2553554664163182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004770634691590476, + "kl": 0.0123291015625, + "learning_rate": 1.8568995403154992e-05, + "loss": 0.0005, + "num_tokens": 599435106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2555261585730136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023970353303214968, + "kl": 0.013092041015625, + "learning_rate": 1.8565922691539092e-05, + "loss": 0.0005, + "num_tokens": 600003090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.255696850729709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045449892704357695, + "kl": 0.0120849609375, + "learning_rate": 1.856284693932985e-05, + "loss": 0.0005, + "num_tokens": 600575122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25586754288640434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019243741754074696, + "kl": 0.0128936767578125, + "learning_rate": 1.8559768147619043e-05, + "loss": 0.0005, + "num_tokens": 601141682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25603823504309975, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.699283961479965e-05, + "kl": 0.0126190185546875, + "learning_rate": 1.8556686317499536e-05, + "loss": 0.0005, + "num_tokens": 601704642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25620892719979516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004427928804578369, + "kl": 0.0123291015625, + "learning_rate": 1.8553601450065265e-05, + "loss": 0.0005, + "num_tokens": 602271810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25637961935649056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002659608403248092, + "kl": 0.01251220703125, + "learning_rate": 1.8550513546411248e-05, + "loss": 0.0005, + "num_tokens": 602838258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25655031151318597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005640284524533789, + "kl": 0.0124664306640625, + "learning_rate": 1.8547422607633574e-05, + "loss": 0.0005, + "num_tokens": 603406962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2567210036698814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002511564865647613, + "kl": 0.012847900390625, + "learning_rate": 1.8544328634829423e-05, + "loss": 0.0005, + "num_tokens": 603972514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2568916958265768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000582196045673033, + "kl": 0.0126800537109375, + "learning_rate": 1.8541231629097038e-05, + "loss": 0.0005, + "num_tokens": 604534370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2570623879832722, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027891379136045746, + "kl": 0.0125885009765625, + "learning_rate": 1.853813159153574e-05, + "loss": 0.0005, + "num_tokens": 605103842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2572330801399676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001114219928896809, + "kl": 0.01239013671875, + "learning_rate": 1.853502852324594e-05, + "loss": 0.0005, + "num_tokens": 605670370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25740377229666295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005369164233472659, + "kl": 0.0124664306640625, + "learning_rate": 1.8531922425329112e-05, + "loss": 0.0005, + "num_tokens": 606238386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25757446445335835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017119042097856325, + "kl": 0.0128631591796875, + "learning_rate": 1.852881329888781e-05, + "loss": 0.0005, + "num_tokens": 606806498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25774515661005376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020343755258390813, + "kl": 0.0125885009765625, + "learning_rate": 1.8525701145025655e-05, + "loss": 0.0005, + "num_tokens": 607366018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25791584876674917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010210162084860952, + "kl": 0.0121307373046875, + "learning_rate": 1.8522585964847356e-05, + "loss": 0.0005, + "num_tokens": 607929394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2580865409234446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001527607071036861, + "kl": 0.0121307373046875, + "learning_rate": 1.8519467759458686e-05, + "loss": 0.0005, + "num_tokens": 608496274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25825723308014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003824557673516079, + "kl": 0.0129852294921875, + "learning_rate": 1.8516346529966496e-05, + "loss": 0.0005, + "num_tokens": 609061234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2584279252368354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003165891501062766, + "kl": 0.012939453125, + "learning_rate": 1.851322227747871e-05, + "loss": 0.0005, + "num_tokens": 609629090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2585986173935308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002498229885791763, + "kl": 0.0117645263671875, + "learning_rate": 1.8510095003104325e-05, + "loss": 0.0005, + "num_tokens": 610198802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25876930955022615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005069077833392936, + "kl": 0.0127410888671875, + "learning_rate": 1.8506964707953413e-05, + "loss": 0.0005, + "num_tokens": 610762914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25894000170692155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004979182191597604, + "kl": 0.0128173828125, + "learning_rate": 1.8503831393137114e-05, + "loss": 0.0005, + "num_tokens": 611324978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25911069386361696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007048812969652427, + "kl": 0.01220703125, + "learning_rate": 1.8500695059767638e-05, + "loss": 0.0005, + "num_tokens": 611889906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25928138602031237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004913086867524031, + "kl": 0.01287841796875, + "learning_rate": 1.8497555708958273e-05, + "loss": 0.0005, + "num_tokens": 612456242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25945207817700777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039087420656315177, + "kl": 0.0124969482421875, + "learning_rate": 1.849441334182338e-05, + "loss": 0.0005, + "num_tokens": 613024514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2596227703337032, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001596616030165226, + "kl": 0.012725830078125, + "learning_rate": 1.8491267959478377e-05, + "loss": 0.0005, + "num_tokens": 613590194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2597934624903986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019201477278450987, + "kl": 0.01214599609375, + "learning_rate": 1.8488119563039766e-05, + "loss": 0.0005, + "num_tokens": 614154818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.259964154647094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005536965043705694, + "kl": 0.012481689453125, + "learning_rate": 1.8484968153625118e-05, + "loss": 0.0005, + "num_tokens": 614720994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26013484680378934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006153212993570614, + "kl": 0.0124969482421875, + "learning_rate": 1.8481813732353064e-05, + "loss": 0.0005, + "num_tokens": 615282658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26030553896048475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013876261454487867, + "kl": 0.0124359130859375, + "learning_rate": 1.8478656300343303e-05, + "loss": 0.0005, + "num_tokens": 615848754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26047623111718016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002127324632237195, + "kl": 0.0122833251953125, + "learning_rate": 1.8475495858716624e-05, + "loss": 0.0005, + "num_tokens": 616417330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26064692327387556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009735288229272801, + "kl": 0.0128326416015625, + "learning_rate": 1.8472332408594864e-05, + "loss": 0.0005, + "num_tokens": 616979138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26081761543057097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022737724104465756, + "kl": 0.0132598876953125, + "learning_rate": 1.846916595110093e-05, + "loss": 0.0005, + "num_tokens": 617542898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2609883075872664, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.488869579717535e-05, + "kl": 0.0123443603515625, + "learning_rate": 1.8465996487358805e-05, + "loss": 0.0005, + "num_tokens": 618105074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2611589997439618, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004843816281392473, + "kl": 0.0124969482421875, + "learning_rate": 1.846282401849353e-05, + "loss": 0.0005, + "num_tokens": 618668706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2613296919006572, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003976393093676821, + "kl": 0.0124053955078125, + "learning_rate": 1.8459648545631218e-05, + "loss": 0.0005, + "num_tokens": 619242562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26150038405735254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044231747332806426, + "kl": 0.01287841796875, + "learning_rate": 1.845647006989905e-05, + "loss": 0.0005, + "num_tokens": 619809010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26167107621404795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005662512029449803, + "kl": 0.0126953125, + "learning_rate": 1.8453288592425267e-05, + "loss": 0.0005, + "num_tokens": 620378274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26184176837074336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004532322062410923, + "kl": 0.0120391845703125, + "learning_rate": 1.845010411433918e-05, + "loss": 0.0005, + "num_tokens": 620940306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26201246052743876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034531402138215944, + "kl": 0.012603759765625, + "learning_rate": 1.8446916636771165e-05, + "loss": 0.0005, + "num_tokens": 621507810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26218315268413417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004477392427911161, + "kl": 0.0121917724609375, + "learning_rate": 1.844372616085266e-05, + "loss": 0.0005, + "num_tokens": 622070018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2623538448408296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005479438904964971, + "kl": 0.0123443603515625, + "learning_rate": 1.844053268771617e-05, + "loss": 0.0005, + "num_tokens": 622632066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.262524536997525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042043692248978876, + "kl": 0.01226806640625, + "learning_rate": 1.8437336218495258e-05, + "loss": 0.0005, + "num_tokens": 623194066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2626952291542204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038359178119885443, + "kl": 0.0122833251953125, + "learning_rate": 1.843413675432456e-05, + "loss": 0.0005, + "num_tokens": 623758930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26286592131091574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005085273386705324, + "kl": 0.01220703125, + "learning_rate": 1.843093429633977e-05, + "loss": 0.0005, + "num_tokens": 624329874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26303661346761115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002905081882548507, + "kl": 0.01239013671875, + "learning_rate": 1.842772884567764e-05, + "loss": 0.0005, + "num_tokens": 624899010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26320730562430655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003774082971078489, + "kl": 0.0122222900390625, + "learning_rate": 1.8424520403475997e-05, + "loss": 0.0005, + "num_tokens": 625469042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26337799778100196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004976108935402397, + "kl": 0.012451171875, + "learning_rate": 1.8421308970873722e-05, + "loss": 0.0005, + "num_tokens": 626031458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26354868993769737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012642908870499957, + "kl": 0.0126953125, + "learning_rate": 1.8418094549010747e-05, + "loss": 0.0005, + "num_tokens": 626596050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2637193820943928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001015733268664329, + "kl": 0.0129241943359375, + "learning_rate": 1.8414877139028087e-05, + "loss": 0.0005, + "num_tokens": 627158066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2638900742510882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006421897897527173, + "kl": 0.012481689453125, + "learning_rate": 1.8411656742067803e-05, + "loss": 0.0005, + "num_tokens": 627724866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2640607664077836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002716212021830987, + "kl": 0.0126495361328125, + "learning_rate": 1.8408433359273018e-05, + "loss": 0.0005, + "num_tokens": 628292434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26423145856447894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011836678974859586, + "kl": 0.01275634765625, + "learning_rate": 1.8405206991787918e-05, + "loss": 0.0005, + "num_tokens": 628857026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26440215072117434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021354731790438223, + "kl": 0.0121307373046875, + "learning_rate": 1.8401977640757742e-05, + "loss": 0.0005, + "num_tokens": 629420930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26457284287786975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045636832903242855, + "kl": 0.012603759765625, + "learning_rate": 1.8398745307328802e-05, + "loss": 0.0005, + "num_tokens": 629982578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26474353503456516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004124108783147749, + "kl": 0.0126495361328125, + "learning_rate": 1.8395509992648456e-05, + "loss": 0.0005, + "num_tokens": 630549762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26491422719126057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000995399374532838, + "kl": 0.01226806640625, + "learning_rate": 1.839227169786512e-05, + "loss": 0.0005, + "num_tokens": 631111474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26508491934795597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036852649951944674, + "kl": 0.012298583984375, + "learning_rate": 1.8389030424128277e-05, + "loss": 0.0005, + "num_tokens": 631674946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2652556115046514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030995505836582476, + "kl": 0.01214599609375, + "learning_rate": 1.838578617258846e-05, + "loss": 0.0005, + "num_tokens": 632245170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2654263036613468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002496575201078672, + "kl": 0.0121307373046875, + "learning_rate": 1.838253894439726e-05, + "loss": 0.0005, + "num_tokens": 632808290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26559699581804214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012025479896462559, + "kl": 0.014007568359375, + "learning_rate": 1.8379288740707326e-05, + "loss": 0.0006, + "num_tokens": 633377282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26576768797473754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006818979192989937, + "kl": 0.0123748779296875, + "learning_rate": 1.8376035562672366e-05, + "loss": 0.0005, + "num_tokens": 633945026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26593838013143295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037230451304579634, + "kl": 0.012481689453125, + "learning_rate": 1.837277941144714e-05, + "loss": 0.0005, + "num_tokens": 634510226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26610907228812836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004738627338278918, + "kl": 0.012664794921875, + "learning_rate": 1.836952028818746e-05, + "loss": 0.0005, + "num_tokens": 635074290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26627976444482376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005521829430884444, + "kl": 0.0120849609375, + "learning_rate": 1.8366258194050202e-05, + "loss": 0.0005, + "num_tokens": 635637202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26645045660151917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004236469605765348, + "kl": 0.0122833251953125, + "learning_rate": 1.836299313019329e-05, + "loss": 0.0005, + "num_tokens": 636198466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2666211487582146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003006233361744288, + "kl": 0.012420654296875, + "learning_rate": 1.83597250977757e-05, + "loss": 0.0005, + "num_tokens": 636764978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26679184091491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012681938064264557, + "kl": 0.0125579833984375, + "learning_rate": 1.8356454097957474e-05, + "loss": 0.0005, + "num_tokens": 637330978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26696253307160533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000486216322195559, + "kl": 0.012359619140625, + "learning_rate": 1.8353180131899687e-05, + "loss": 0.0005, + "num_tokens": 637894146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26713322522830074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005065642001742213, + "kl": 0.0124359130859375, + "learning_rate": 1.834990320076449e-05, + "loss": 0.0005, + "num_tokens": 638464210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26730391738499615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020621589781361104, + "kl": 0.0121917724609375, + "learning_rate": 1.8346623305715073e-05, + "loss": 0.0005, + "num_tokens": 639022658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26747460954169155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036351119405692955, + "kl": 0.0128173828125, + "learning_rate": 1.8343340447915675e-05, + "loss": 0.0005, + "num_tokens": 639590818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26764530169838696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003422353061921526, + "kl": 0.0127410888671875, + "learning_rate": 1.8340054628531598e-05, + "loss": 0.0005, + "num_tokens": 640151250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26781599385508237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005622940566049771, + "kl": 0.01318359375, + "learning_rate": 1.8336765848729183e-05, + "loss": 0.0005, + "num_tokens": 640720018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2679866860117778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005415007045541317, + "kl": 0.0122222900390625, + "learning_rate": 1.8333474109675834e-05, + "loss": 0.0005, + "num_tokens": 641278578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2681573781684732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003648663399655015, + "kl": 0.0132598876953125, + "learning_rate": 1.8330179412539996e-05, + "loss": 0.0005, + "num_tokens": 641841970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26832807032516853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012932095032189544, + "kl": 0.0123443603515625, + "learning_rate": 1.8326881758491174e-05, + "loss": 0.0005, + "num_tokens": 642406066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26849876248186394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006958220308391844, + "kl": 0.012786865234375, + "learning_rate": 1.8323581148699903e-05, + "loss": 0.0005, + "num_tokens": 642968066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26866945463855935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031287427765968267, + "kl": 0.0128936767578125, + "learning_rate": 1.8320277584337793e-05, + "loss": 0.0005, + "num_tokens": 643534674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26884014679525475, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.668352087006907e-05, + "kl": 0.0125885009765625, + "learning_rate": 1.8316971066577482e-05, + "loss": 0.0005, + "num_tokens": 644096482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26901083895195016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048121025280375567, + "kl": 0.0128326416015625, + "learning_rate": 1.8313661596592675e-05, + "loss": 0.0005, + "num_tokens": 644664386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26918153110864557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016153005365360617, + "kl": 0.01239013671875, + "learning_rate": 1.83103491755581e-05, + "loss": 0.0005, + "num_tokens": 645228418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.269352223265341, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004283908444213762, + "kl": 0.0130615234375, + "learning_rate": 1.8307033804649553e-05, + "loss": 0.0005, + "num_tokens": 645794882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2695229154220364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005420225001463817, + "kl": 0.0123291015625, + "learning_rate": 1.830371548504388e-05, + "loss": 0.0005, + "num_tokens": 646361090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26969360757873173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004090133935325598, + "kl": 0.0126190185546875, + "learning_rate": 1.830039421791895e-05, + "loss": 0.0005, + "num_tokens": 646923666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26986429973542714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006913869078505173, + "kl": 0.01300048828125, + "learning_rate": 1.8297070004453702e-05, + "loss": 0.0005, + "num_tokens": 647482082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27003499189212254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007016125551171108, + "kl": 0.012939453125, + "learning_rate": 1.829374284582811e-05, + "loss": 0.0005, + "num_tokens": 648043106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27020568404881795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015161093973150516, + "kl": 0.012481689453125, + "learning_rate": 1.8290412743223188e-05, + "loss": 0.0005, + "num_tokens": 648609234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27037637620551336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007351977843147883, + "kl": 0.012908935546875, + "learning_rate": 1.8287079697821018e-05, + "loss": 0.0005, + "num_tokens": 649173570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27054706836220876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002335900010293796, + "kl": 0.012054443359375, + "learning_rate": 1.82837437108047e-05, + "loss": 0.0005, + "num_tokens": 649744930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27071776051890417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000608658651461548, + "kl": 0.012542724609375, + "learning_rate": 1.8280404783358387e-05, + "loss": 0.0005, + "num_tokens": 650307506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2708884526755996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004833724837149789, + "kl": 0.0121917724609375, + "learning_rate": 1.8277062916667283e-05, + "loss": 0.0005, + "num_tokens": 650880866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27105914483229493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006757852681258551, + "kl": 0.0121307373046875, + "learning_rate": 1.8273718111917627e-05, + "loss": 0.0005, + "num_tokens": 651459666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27122983698899034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014622553354505983, + "kl": 0.0122833251953125, + "learning_rate": 1.8270370370296707e-05, + "loss": 0.0005, + "num_tokens": 652020690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27140052914568574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044125074650392765, + "kl": 0.0128021240234375, + "learning_rate": 1.826701969299285e-05, + "loss": 0.0005, + "num_tokens": 652582034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27157122130238115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005583202206096826, + "kl": 0.012786865234375, + "learning_rate": 1.8263666081195423e-05, + "loss": 0.0005, + "num_tokens": 653146322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27174191345907656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006103611405209762, + "kl": 0.0124664306640625, + "learning_rate": 1.8260309536094837e-05, + "loss": 0.0005, + "num_tokens": 653712882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27191260561577196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001997296003556552, + "kl": 0.012969970703125, + "learning_rate": 1.8256950058882547e-05, + "loss": 0.0005, + "num_tokens": 654276034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27208329777246737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005885043341662722, + "kl": 0.0122528076171875, + "learning_rate": 1.8253587650751045e-05, + "loss": 0.0005, + "num_tokens": 654845682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2722539899291628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002314631079408568, + "kl": 0.01275634765625, + "learning_rate": 1.8250222312893866e-05, + "loss": 0.0005, + "num_tokens": 655407586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2724246820858581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006881221244994953, + "kl": 0.0120391845703125, + "learning_rate": 1.824685404650558e-05, + "loss": 0.0005, + "num_tokens": 655982834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27259537424255353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005197098481702097, + "kl": 0.012725830078125, + "learning_rate": 1.82434828527818e-05, + "loss": 0.0005, + "num_tokens": 656544562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27276606639924894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004382819391627453, + "kl": 0.012176513671875, + "learning_rate": 1.8240108732919185e-05, + "loss": 0.0005, + "num_tokens": 657107826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27293675855594435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025577059559117105, + "kl": 0.011810302734375, + "learning_rate": 1.8236731688115417e-05, + "loss": 0.0005, + "num_tokens": 657673794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27310745071263975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005087579198286902, + "kl": 0.0130615234375, + "learning_rate": 1.8233351719569227e-05, + "loss": 0.0005, + "num_tokens": 658237922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27327814286933516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004074116017715989, + "kl": 0.012603759765625, + "learning_rate": 1.8229968828480387e-05, + "loss": 0.0005, + "num_tokens": 658804386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27344883502603057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003398222357138151, + "kl": 0.012969970703125, + "learning_rate": 1.8226583016049698e-05, + "loss": 0.0005, + "num_tokens": 659369154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.273619527182726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046356814837985086, + "kl": 0.0126953125, + "learning_rate": 1.8223194283479e-05, + "loss": 0.0005, + "num_tokens": 659934274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2737902193394214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010817150073451247, + "kl": 0.012359619140625, + "learning_rate": 1.8219802631971167e-05, + "loss": 0.0005, + "num_tokens": 660499458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27396091149611673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034572372790006676, + "kl": 0.011505126953125, + "learning_rate": 1.8216408062730124e-05, + "loss": 0.0005, + "num_tokens": 661061842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27413160365281214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013296261736858044, + "kl": 0.0126800537109375, + "learning_rate": 1.821301057696081e-05, + "loss": 0.0005, + "num_tokens": 661628162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27430229580950755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006019351349414534, + "kl": 0.011749267578125, + "learning_rate": 1.8209610175869214e-05, + "loss": 0.0005, + "num_tokens": 662191026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27447298796620295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005569937461513186, + "kl": 0.0128173828125, + "learning_rate": 1.820620686066236e-05, + "loss": 0.0005, + "num_tokens": 662754450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27464368012289836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006963596557837446, + "kl": 0.01275634765625, + "learning_rate": 1.8202800632548292e-05, + "loss": 0.0005, + "num_tokens": 663317042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27481437227959377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043409464080815214, + "kl": 0.0122222900390625, + "learning_rate": 1.8199391492736107e-05, + "loss": 0.0005, + "num_tokens": 663881122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2749850644362892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016507433716182529, + "kl": 0.012664794921875, + "learning_rate": 1.819597944243592e-05, + "loss": 0.0005, + "num_tokens": 664448338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2751557565929846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018006486513168175, + "kl": 0.0123748779296875, + "learning_rate": 1.819256448285889e-05, + "loss": 0.0005, + "num_tokens": 665018418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27532644874967993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003125318523797538, + "kl": 0.012359619140625, + "learning_rate": 1.8189146615217205e-05, + "loss": 0.0005, + "num_tokens": 665603394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27549714090637534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012582866209449241, + "kl": 0.0126953125, + "learning_rate": 1.8185725840724083e-05, + "loss": 0.0005, + "num_tokens": 666170786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27566783306307074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019278736656125957, + "kl": 0.013214111328125, + "learning_rate": 1.8182302160593773e-05, + "loss": 0.0005, + "num_tokens": 666733874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27583852521976615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005712360567091613, + "kl": 0.0117645263671875, + "learning_rate": 1.8178875576041562e-05, + "loss": 0.0005, + "num_tokens": 667300210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27600921737646156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044661036636599533, + "kl": 0.012420654296875, + "learning_rate": 1.8175446088283762e-05, + "loss": 0.0005, + "num_tokens": 667864466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27617990953315696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004607508917665181, + "kl": 0.0120849609375, + "learning_rate": 1.8172013698537714e-05, + "loss": 0.0005, + "num_tokens": 668439794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27635060168985237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005412556143179372, + "kl": 0.0129852294921875, + "learning_rate": 1.81685784080218e-05, + "loss": 0.0005, + "num_tokens": 669001186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2765212938465478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018014859997942875, + "kl": 0.0124053955078125, + "learning_rate": 1.8165140217955417e-05, + "loss": 0.0005, + "num_tokens": 669567378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27669198600324313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005843540027991219, + "kl": 0.0120697021484375, + "learning_rate": 1.8161699129559002e-05, + "loss": 0.0005, + "num_tokens": 670132562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27686267815993854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006148067087472107, + "kl": 0.0126495361328125, + "learning_rate": 1.8158255144054018e-05, + "loss": 0.0005, + "num_tokens": 670698626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27703337031663394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004888021100350662, + "kl": 0.0125732421875, + "learning_rate": 1.815480826266295e-05, + "loss": 0.0005, + "num_tokens": 671263394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27720406247332935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005402781755974068, + "kl": 0.0128326416015625, + "learning_rate": 1.8151358486609324e-05, + "loss": 0.0005, + "num_tokens": 671832658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27737475463002476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016388123466922477, + "kl": 0.0125274658203125, + "learning_rate": 1.814790581711768e-05, + "loss": 0.0005, + "num_tokens": 672394898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27754544678672016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033435431097565958, + "kl": 0.0126800537109375, + "learning_rate": 1.81444502554136e-05, + "loss": 0.0005, + "num_tokens": 672964418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27771613894341557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021416766004384401, + "kl": 0.0125579833984375, + "learning_rate": 1.814099180272367e-05, + "loss": 0.0005, + "num_tokens": 673527362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.277886831100111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002666391451746685, + "kl": 0.0122528076171875, + "learning_rate": 1.8137530460275524e-05, + "loss": 0.0005, + "num_tokens": 674092210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2780575232568063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030658535132867037, + "kl": 0.012725830078125, + "learning_rate": 1.8134066229297812e-05, + "loss": 0.0005, + "num_tokens": 674662786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27822821541350173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026908287863376687, + "kl": 0.0132598876953125, + "learning_rate": 1.8130599111020215e-05, + "loss": 0.0005, + "num_tokens": 675228690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27839890757019714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020113066358638814, + "kl": 0.011962890625, + "learning_rate": 1.812712910667343e-05, + "loss": 0.0005, + "num_tokens": 675800370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27856959972689255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000412018595799956, + "kl": 0.01153564453125, + "learning_rate": 1.8123656217489184e-05, + "loss": 0.0005, + "num_tokens": 676366482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27874029188358795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004853136074860955, + "kl": 0.0133514404296875, + "learning_rate": 1.8120180444700227e-05, + "loss": 0.0005, + "num_tokens": 676930866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27891098404028336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004783106021226797, + "kl": 0.012176513671875, + "learning_rate": 1.8116701789540337e-05, + "loss": 0.0005, + "num_tokens": 677496690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27908167619697877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000500023235400908, + "kl": 0.0130767822265625, + "learning_rate": 1.8113220253244304e-05, + "loss": 0.0005, + "num_tokens": 678064002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2792523683536742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008018450052050262, + "kl": 0.012939453125, + "learning_rate": 1.8109735837047956e-05, + "loss": 0.0005, + "num_tokens": 678623970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2794230605103695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019753895299314268, + "kl": 0.01202392578125, + "learning_rate": 1.8106248542188125e-05, + "loss": 0.0005, + "num_tokens": 679190834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27959375266706493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005995861950643077, + "kl": 0.0126800537109375, + "learning_rate": 1.8102758369902684e-05, + "loss": 0.0005, + "num_tokens": 679759266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27976444482376034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007510240823183231, + "kl": 0.013092041015625, + "learning_rate": 1.8099265321430513e-05, + "loss": 0.0005, + "num_tokens": 680322322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.27993513698045575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047780617909439816, + "kl": 0.0124053955078125, + "learning_rate": 1.809576939801152e-05, + "loss": 0.0005, + "num_tokens": 680885970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28010582913715115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031632170283820213, + "kl": 0.0129852294921875, + "learning_rate": 1.809227060088663e-05, + "loss": 0.0005, + "num_tokens": 681449906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28027652129384656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006017245185163871, + "kl": 0.01251220703125, + "learning_rate": 1.8088768931297792e-05, + "loss": 0.0005, + "num_tokens": 682014722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28044721345054197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002299363397844884, + "kl": 0.012420654296875, + "learning_rate": 1.808526439048797e-05, + "loss": 0.0005, + "num_tokens": 682578450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2806179056072374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001703501110167719, + "kl": 0.0128326416015625, + "learning_rate": 1.8081756979701156e-05, + "loss": 0.0005, + "num_tokens": 683150370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2807885977639327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010808524321095857, + "kl": 0.012298583984375, + "learning_rate": 1.8078246700182345e-05, + "loss": 0.0005, + "num_tokens": 683717922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28095928992062813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003858870582147105, + "kl": 0.0124359130859375, + "learning_rate": 1.8074733553177562e-05, + "loss": 0.0005, + "num_tokens": 684284018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28112998207732354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005349846554678298, + "kl": 0.0127105712890625, + "learning_rate": 1.807121753993385e-05, + "loss": 0.0005, + "num_tokens": 684854722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28130067423401894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012638579931847753, + "kl": 0.0132904052734375, + "learning_rate": 1.8067698661699266e-05, + "loss": 0.0005, + "num_tokens": 685420898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28147136639071435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001409545636189973, + "kl": 0.01226806640625, + "learning_rate": 1.8064176919722885e-05, + "loss": 0.0005, + "num_tokens": 685986498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28164205854740976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045403818056341977, + "kl": 0.0127410888671875, + "learning_rate": 1.8060652315254797e-05, + "loss": 0.0005, + "num_tokens": 686551874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28181275070410516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003930245363613098, + "kl": 0.0136260986328125, + "learning_rate": 1.8057124849546107e-05, + "loss": 0.0005, + "num_tokens": 687116034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28198344286080057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001155701411366035, + "kl": 0.01226806640625, + "learning_rate": 1.8053594523848942e-05, + "loss": 0.0005, + "num_tokens": 687681010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2821541350174959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005436678952331473, + "kl": 0.0123291015625, + "learning_rate": 1.805006133941644e-05, + "loss": 0.0005, + "num_tokens": 688257074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28232482717419133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009848158963521262, + "kl": 0.013092041015625, + "learning_rate": 1.8046525297502754e-05, + "loss": 0.0005, + "num_tokens": 688830034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28249551933088674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002676584986486217, + "kl": 0.0122528076171875, + "learning_rate": 1.8042986399363053e-05, + "loss": 0.0005, + "num_tokens": 689395922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28266621148758214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001142785417123716, + "kl": 0.012847900390625, + "learning_rate": 1.803944464625351e-05, + "loss": 0.0005, + "num_tokens": 689964402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28283690364427755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005099044274619413, + "kl": 0.0127105712890625, + "learning_rate": 1.803590003943133e-05, + "loss": 0.0005, + "num_tokens": 690530354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28300759580097296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012919723142207313, + "kl": 0.0125732421875, + "learning_rate": 1.8032352580154708e-05, + "loss": 0.0005, + "num_tokens": 691098338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28317828795766836, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.640942626821821e-05, + "kl": 0.0124664306640625, + "learning_rate": 1.8028802269682878e-05, + "loss": 0.0005, + "num_tokens": 691666418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28334898011436377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029072895487546263, + "kl": 0.0120086669921875, + "learning_rate": 1.802524910927606e-05, + "loss": 0.0005, + "num_tokens": 692234306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2835196722710591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033828248541192574, + "kl": 0.01275634765625, + "learning_rate": 1.802169310019551e-05, + "loss": 0.0005, + "num_tokens": 692798402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2836903644277545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005097052053817209, + "kl": 0.0125579833984375, + "learning_rate": 1.801813424370347e-05, + "loss": 0.0005, + "num_tokens": 693363218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28386105658444993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005177502646505177, + "kl": 0.011871337890625, + "learning_rate": 1.8014572541063212e-05, + "loss": 0.0005, + "num_tokens": 693932674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28403174874114534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009158892413924381, + "kl": 0.012939453125, + "learning_rate": 1.8011007993539015e-05, + "loss": 0.0005, + "num_tokens": 694499906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28420244089784075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004970780297375316, + "kl": 0.013092041015625, + "learning_rate": 1.800744060239616e-05, + "loss": 0.0005, + "num_tokens": 695077042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28437313305453615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004656011012828543, + "kl": 0.0129852294921875, + "learning_rate": 1.800387036890094e-05, + "loss": 0.0005, + "num_tokens": 695646530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28454382521123156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000408361293377853, + "kl": 0.01287841796875, + "learning_rate": 1.8000297294320662e-05, + "loss": 0.0005, + "num_tokens": 696211922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28471451736792697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002474433179119656, + "kl": 0.012664794921875, + "learning_rate": 1.7996721379923643e-05, + "loss": 0.0005, + "num_tokens": 696784322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2848852095246223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003762803691553369, + "kl": 0.011871337890625, + "learning_rate": 1.79931426269792e-05, + "loss": 0.0005, + "num_tokens": 697347282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2850559016813177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026403425230358873, + "kl": 0.0124664306640625, + "learning_rate": 1.7989561036757653e-05, + "loss": 0.0005, + "num_tokens": 697915154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28522659383801313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004379214377189704, + "kl": 0.01251220703125, + "learning_rate": 1.7985976610530352e-05, + "loss": 0.0005, + "num_tokens": 698483666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28539728599470854, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.725273615206346e-05, + "kl": 0.0123291015625, + "learning_rate": 1.7982389349569626e-05, + "loss": 0.0005, + "num_tokens": 699048306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28556797815140395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017591341381373521, + "kl": 0.0123443603515625, + "learning_rate": 1.797879925514883e-05, + "loss": 0.0005, + "num_tokens": 699615778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28573867030809935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022875901981171025, + "kl": 0.0129241943359375, + "learning_rate": 1.7975206328542318e-05, + "loss": 0.0005, + "num_tokens": 700177826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28590936246479476, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.327073519972038e-05, + "kl": 0.0117645263671875, + "learning_rate": 1.797161057102545e-05, + "loss": 0.0005, + "num_tokens": 700745042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28608005462149017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006016949208704011, + "kl": 0.0122833251953125, + "learning_rate": 1.7968011983874584e-05, + "loss": 0.0005, + "num_tokens": 701310610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2862507467781855, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.9115099608900005e-05, + "kl": 0.0126800537109375, + "learning_rate": 1.7964410568367094e-05, + "loss": 0.0005, + "num_tokens": 701875458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2864214389348809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003133010649698791, + "kl": 0.0121612548828125, + "learning_rate": 1.796080632578135e-05, + "loss": 0.0005, + "num_tokens": 702435906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28659213109157633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039850146297678327, + "kl": 0.01263427734375, + "learning_rate": 1.795719925739673e-05, + "loss": 0.0005, + "num_tokens": 703000786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28676282324827174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010883672409057237, + "kl": 0.0123443603515625, + "learning_rate": 1.7953589364493612e-05, + "loss": 0.0005, + "num_tokens": 703563890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28693351540496714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003263093822569564, + "kl": 0.0125732421875, + "learning_rate": 1.7949976648353377e-05, + "loss": 0.0005, + "num_tokens": 704129314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28710420756166255, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.005415130208654e-05, + "kl": 0.01275634765625, + "learning_rate": 1.794636111025841e-05, + "loss": 0.0005, + "num_tokens": 704692770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28727489971835796, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.789402345181801e-05, + "kl": 0.01220703125, + "learning_rate": 1.7942742751492095e-05, + "loss": 0.0005, + "num_tokens": 705252658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28744559187505336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005200295309273096, + "kl": 0.0125732421875, + "learning_rate": 1.793912157333882e-05, + "loss": 0.0005, + "num_tokens": 705814610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2876162840317487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045104513325881383, + "kl": 0.011810302734375, + "learning_rate": 1.7935497577083974e-05, + "loss": 0.0005, + "num_tokens": 706380450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2877869761884441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041143134613039884, + "kl": 0.0124664306640625, + "learning_rate": 1.7931870764013942e-05, + "loss": 0.0005, + "num_tokens": 706949394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28795766834513953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019883710703729374, + "kl": 0.0125885009765625, + "learning_rate": 1.7928241135416114e-05, + "loss": 0.0005, + "num_tokens": 707512482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28812836050183493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014138539058434138, + "kl": 0.0126495361328125, + "learning_rate": 1.7924608692578874e-05, + "loss": 0.0005, + "num_tokens": 708080930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28829905265853034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015669578581033361, + "kl": 0.0121917724609375, + "learning_rate": 1.7920973436791607e-05, + "loss": 0.0005, + "num_tokens": 708645538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28846974481522575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007451324712183835, + "kl": 0.0129241943359375, + "learning_rate": 1.791733536934471e-05, + "loss": 0.0005, + "num_tokens": 709210274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28864043697192115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020193807999386548, + "kl": 0.012298583984375, + "learning_rate": 1.791369449152955e-05, + "loss": 0.0005, + "num_tokens": 709775218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28881112912861656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001392369392245115, + "kl": 0.011749267578125, + "learning_rate": 1.7910050804638514e-05, + "loss": 0.0005, + "num_tokens": 710337538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2889818212853119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001515014790563037, + "kl": 0.0128326416015625, + "learning_rate": 1.790640430996498e-05, + "loss": 0.0005, + "num_tokens": 710900626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2891525134420073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024277882356951337, + "kl": 0.012847900390625, + "learning_rate": 1.790275500880332e-05, + "loss": 0.0005, + "num_tokens": 711467106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2893232055987027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033877918081269055, + "kl": 0.0123748779296875, + "learning_rate": 1.7899102902448904e-05, + "loss": 0.0005, + "num_tokens": 712029554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28949389775539813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005995097847327878, + "kl": 0.011962890625, + "learning_rate": 1.7895447992198098e-05, + "loss": 0.0005, + "num_tokens": 712605234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28966458991209354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014815341847950562, + "kl": 0.0129241943359375, + "learning_rate": 1.7891790279348267e-05, + "loss": 0.0005, + "num_tokens": 713171138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28983528206878895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006474373282246222, + "kl": 0.012420654296875, + "learning_rate": 1.7888129765197762e-05, + "loss": 0.0005, + "num_tokens": 713735362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29000597422548435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023220864295164946, + "kl": 0.0124969482421875, + "learning_rate": 1.7884466451045937e-05, + "loss": 0.0005, + "num_tokens": 714301154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29017666638217976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000487603243510023, + "kl": 0.0123443603515625, + "learning_rate": 1.7880800338193134e-05, + "loss": 0.0005, + "num_tokens": 714868834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2903473585388751, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004262386595342618, + "kl": 0.0120086669921875, + "learning_rate": 1.7877131427940693e-05, + "loss": 0.0005, + "num_tokens": 715434210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2905180506955705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010821390568355482, + "kl": 0.01287841796875, + "learning_rate": 1.787345972159094e-05, + "loss": 0.0005, + "num_tokens": 716002114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2906887428522659, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003534118351272533, + "kl": 0.0127105712890625, + "learning_rate": 1.7869785220447204e-05, + "loss": 0.0005, + "num_tokens": 716572210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29085943500896133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014526299916017696, + "kl": 0.0125579833984375, + "learning_rate": 1.78661079258138e-05, + "loss": 0.0005, + "num_tokens": 717138370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29103012716565674, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.816223551424011e-05, + "kl": 0.012359619140625, + "learning_rate": 1.7862427838996025e-05, + "loss": 0.0005, + "num_tokens": 717705458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29120081932235214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00103764590487452, + "kl": 0.01220703125, + "learning_rate": 1.785874496130019e-05, + "loss": 0.0005, + "num_tokens": 718267362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29137151147904755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023273217908591142, + "kl": 0.0124053955078125, + "learning_rate": 1.7855059294033575e-05, + "loss": 0.0005, + "num_tokens": 718827794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29154220363574296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004574623508074034, + "kl": 0.0123443603515625, + "learning_rate": 1.785137083850447e-05, + "loss": 0.0005, + "num_tokens": 719388546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29171289579243836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003242587017589023, + "kl": 0.0123443603515625, + "learning_rate": 1.7847679596022124e-05, + "loss": 0.0005, + "num_tokens": 719951138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2918835879491337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036818528033603437, + "kl": 0.012969970703125, + "learning_rate": 1.7843985567896817e-05, + "loss": 0.0005, + "num_tokens": 720517250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2920542801058291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011792358882269595, + "kl": 0.01275634765625, + "learning_rate": 1.7840288755439778e-05, + "loss": 0.0005, + "num_tokens": 721085266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29222497226252453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047491086460287843, + "kl": 0.012542724609375, + "learning_rate": 1.783658915996325e-05, + "loss": 0.0005, + "num_tokens": 721650130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29239566441921994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003290156634553239, + "kl": 0.012237548828125, + "learning_rate": 1.7832886782780455e-05, + "loss": 0.0005, + "num_tokens": 722228610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29256635657591534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004427784305744751, + "kl": 0.0118408203125, + "learning_rate": 1.7829181625205606e-05, + "loss": 0.0005, + "num_tokens": 722798946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29273704873261075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001649664452879347, + "kl": 0.0121002197265625, + "learning_rate": 1.7825473688553893e-05, + "loss": 0.0005, + "num_tokens": 723360914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29290774088930616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045727382655040967, + "kl": 0.0126190185546875, + "learning_rate": 1.7821762974141505e-05, + "loss": 0.0005, + "num_tokens": 723925890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29307843304600156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005574181636820548, + "kl": 0.0127410888671875, + "learning_rate": 1.7818049483285614e-05, + "loss": 0.0005, + "num_tokens": 724489858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2932491252026969, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009208270403414522, + "kl": 0.0120697021484375, + "learning_rate": 1.7814333217304368e-05, + "loss": 0.0005, + "num_tokens": 725058786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2934198173593923, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.669146178710992e-05, + "kl": 0.0121917724609375, + "learning_rate": 1.7810614177516913e-05, + "loss": 0.0005, + "num_tokens": 725619858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2935905095160877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022139014852385437, + "kl": 0.0125579833984375, + "learning_rate": 1.7806892365243374e-05, + "loss": 0.0005, + "num_tokens": 726185506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29376120167278313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010014066731931955, + "kl": 0.0125274658203125, + "learning_rate": 1.7803167781804857e-05, + "loss": 0.0005, + "num_tokens": 726749618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29393189382947854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003221721289427448, + "kl": 0.0124664306640625, + "learning_rate": 1.7799440428523452e-05, + "loss": 0.0005, + "num_tokens": 727310098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29410258598617395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001746029011688556, + "kl": 0.012054443359375, + "learning_rate": 1.7795710306722242e-05, + "loss": 0.0005, + "num_tokens": 727873586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29427327814286935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020853106144194178, + "kl": 0.012664794921875, + "learning_rate": 1.7791977417725278e-05, + "loss": 0.0005, + "num_tokens": 728440338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29444397029956476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004936137478531684, + "kl": 0.0124664306640625, + "learning_rate": 1.778824176285761e-05, + "loss": 0.0005, + "num_tokens": 729002450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2946146624562601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025057583093819422, + "kl": 0.0125274658203125, + "learning_rate": 1.7784503343445254e-05, + "loss": 0.0005, + "num_tokens": 729567010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2947853546129555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035875059763568754, + "kl": 0.012298583984375, + "learning_rate": 1.7780762160815212e-05, + "loss": 0.0005, + "num_tokens": 730131602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2949560467696509, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001321270259324049, + "kl": 0.01251220703125, + "learning_rate": 1.7777018216295476e-05, + "loss": 0.0005, + "num_tokens": 730701106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29512673892634633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002640510454717496, + "kl": 0.0125732421875, + "learning_rate": 1.7773271511215008e-05, + "loss": 0.0005, + "num_tokens": 731266722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29529743108304174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033961605917553984, + "kl": 0.0117034912109375, + "learning_rate": 1.776952204690375e-05, + "loss": 0.0005, + "num_tokens": 731833170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29546812323973715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002412645564992061, + "kl": 0.012237548828125, + "learning_rate": 1.776576982469263e-05, + "loss": 0.0005, + "num_tokens": 732396722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29563881539643255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042263350311472474, + "kl": 0.012664794921875, + "learning_rate": 1.776201484591355e-05, + "loss": 0.0005, + "num_tokens": 732962466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29580950755312796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001780438119102699, + "kl": 0.012298583984375, + "learning_rate": 1.7758257111899393e-05, + "loss": 0.0005, + "num_tokens": 733531298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2959801997098233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018223923784127048, + "kl": 0.0131072998046875, + "learning_rate": 1.7754496623984015e-05, + "loss": 0.0005, + "num_tokens": 734101602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2961508918665187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000409211201190918, + "kl": 0.01220703125, + "learning_rate": 1.7750733383502264e-05, + "loss": 0.0005, + "num_tokens": 734669426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2963215840232141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011014031170077249, + "kl": 0.0120697021484375, + "learning_rate": 1.774696739178994e-05, + "loss": 0.0005, + "num_tokens": 735240962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29649227617990953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012584301406113487, + "kl": 0.0119171142578125, + "learning_rate": 1.7743198650183847e-05, + "loss": 0.0005, + "num_tokens": 735805474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29666296833660494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046043873058207284, + "kl": 0.0118255615234375, + "learning_rate": 1.7739427160021744e-05, + "loss": 0.0005, + "num_tokens": 736372034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29683366049330034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00069172627373603, + "kl": 0.012176513671875, + "learning_rate": 1.7735652922642377e-05, + "loss": 0.0005, + "num_tokens": 736931922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29700435264999575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046509178131645034, + "kl": 0.0121917724609375, + "learning_rate": 1.773187593938546e-05, + "loss": 0.0005, + "num_tokens": 737495682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29717504480669116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020486850696041628, + "kl": 0.012908935546875, + "learning_rate": 1.7728096211591696e-05, + "loss": 0.0005, + "num_tokens": 738062962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2973457369633865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015447208847517836, + "kl": 0.0121612548828125, + "learning_rate": 1.772431374060274e-05, + "loss": 0.0005, + "num_tokens": 738628226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2975164291200819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028665896353826834, + "kl": 0.0128173828125, + "learning_rate": 1.772052852776124e-05, + "loss": 0.0005, + "num_tokens": 739193474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2976871212767773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002752409339626876, + "kl": 0.01214599609375, + "learning_rate": 1.7716740574410804e-05, + "loss": 0.0005, + "num_tokens": 739759746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29785781343347273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020814120740433753, + "kl": 0.0134124755859375, + "learning_rate": 1.771294988189603e-05, + "loss": 0.0005, + "num_tokens": 740325842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29802850559016814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005377650695246861, + "kl": 0.01251220703125, + "learning_rate": 1.7709156451562462e-05, + "loss": 0.0005, + "num_tokens": 740890770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29819919774686354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034170719998665106, + "kl": 0.012176513671875, + "learning_rate": 1.770536028475664e-05, + "loss": 0.0005, + "num_tokens": 741455154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29836988990355895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013235706436553774, + "kl": 0.0125885009765625, + "learning_rate": 1.7701561382826068e-05, + "loss": 0.0005, + "num_tokens": 742017522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29854058206025436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002600486655345518, + "kl": 0.0131378173828125, + "learning_rate": 1.769775974711921e-05, + "loss": 0.0005, + "num_tokens": 742588754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2987112742169497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025126534165761466, + "kl": 0.0120849609375, + "learning_rate": 1.769395537898552e-05, + "loss": 0.0005, + "num_tokens": 743163954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2988819663736451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016857172401256172, + "kl": 0.012420654296875, + "learning_rate": 1.7690148279775402e-05, + "loss": 0.0005, + "num_tokens": 743731330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2990526585303405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034706227860153077, + "kl": 0.0124969482421875, + "learning_rate": 1.7686338450840246e-05, + "loss": 0.0005, + "num_tokens": 744291234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2992233506870359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016979281475154644, + "kl": 0.0121612548828125, + "learning_rate": 1.76825258935324e-05, + "loss": 0.0005, + "num_tokens": 744861362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29939404284373133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005022072167792556, + "kl": 0.0131072998046875, + "learning_rate": 1.7678710609205186e-05, + "loss": 0.0005, + "num_tokens": 745425698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29956473500042674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027294514790968476, + "kl": 0.0124664306640625, + "learning_rate": 1.7674892599212893e-05, + "loss": 0.0005, + "num_tokens": 745988402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29973542715712215, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.038040600770232e-05, + "kl": 0.0126495361328125, + "learning_rate": 1.7671071864910772e-05, + "loss": 0.0005, + "num_tokens": 746554818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29990611931381755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040517673095312283, + "kl": 0.012451171875, + "learning_rate": 1.766724840765505e-05, + "loss": 0.0005, + "num_tokens": 747132050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3000768114705129, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003269058744254774, + "kl": 0.011993408203125, + "learning_rate": 1.7663422228802913e-05, + "loss": 0.0005, + "num_tokens": 747704450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3002475036272083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003897399666534683, + "kl": 0.0128173828125, + "learning_rate": 1.7659593329712522e-05, + "loss": 0.0005, + "num_tokens": 748278354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3004181957839037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003282140378331531, + "kl": 0.01214599609375, + "learning_rate": 1.765576171174299e-05, + "loss": 0.0005, + "num_tokens": 748855810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3005888879405991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026187324374587206, + "kl": 0.0124664306640625, + "learning_rate": 1.7651927376254407e-05, + "loss": 0.0005, + "num_tokens": 749420450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30075958009729453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011614367196845658, + "kl": 0.0121307373046875, + "learning_rate": 1.7648090324607827e-05, + "loss": 0.0005, + "num_tokens": 749981794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30093027225398994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016605462388329215, + "kl": 0.0123748779296875, + "learning_rate": 1.7644250558165262e-05, + "loss": 0.0005, + "num_tokens": 750550722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30110096441068535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003669143365019874, + "kl": 0.0123748779296875, + "learning_rate": 1.7640408078289693e-05, + "loss": 0.0005, + "num_tokens": 751115698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30127165656738075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045957603132779936, + "kl": 0.0130615234375, + "learning_rate": 1.7636562886345053e-05, + "loss": 0.0005, + "num_tokens": 751682738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3014423487240761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005898169275162518, + "kl": 0.0125732421875, + "learning_rate": 1.7632714983696258e-05, + "loss": 0.0005, + "num_tokens": 752248722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3016130408807715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003178374449567941, + "kl": 0.0121002197265625, + "learning_rate": 1.7628864371709165e-05, + "loss": 0.0005, + "num_tokens": 752814978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3017837330374669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048152712944242806, + "kl": 0.0124359130859375, + "learning_rate": 1.7625011051750608e-05, + "loss": 0.0005, + "num_tokens": 753381282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3019544251941623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012477623407751574, + "kl": 0.013519287109375, + "learning_rate": 1.7621155025188375e-05, + "loss": 0.0005, + "num_tokens": 753946658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30212511735085773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013470328083095822, + "kl": 0.0124664306640625, + "learning_rate": 1.7617296293391217e-05, + "loss": 0.0005, + "num_tokens": 754509714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30229580950755314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023132765918615632, + "kl": 0.01239013671875, + "learning_rate": 1.761343485772884e-05, + "loss": 0.0005, + "num_tokens": 755076002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30246650166424854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016499250046734555, + "kl": 0.0125885009765625, + "learning_rate": 1.7609570719571924e-05, + "loss": 0.0005, + "num_tokens": 755639346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30263719382094395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02365996297000051, + "kl": 0.034759521484375, + "learning_rate": 1.7605703880292084e-05, + "loss": 0.0014, + "num_tokens": 756239698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3028078859776393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003576425660223966, + "kl": 0.01263427734375, + "learning_rate": 1.7601834341261922e-05, + "loss": 0.0005, + "num_tokens": 756804850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3029785781343347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003423328176347009, + "kl": 0.01220703125, + "learning_rate": 1.7597962103854977e-05, + "loss": 0.0005, + "num_tokens": 757374594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3031492702910301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011121234118695254, + "kl": 0.0130462646484375, + "learning_rate": 1.7594087169445756e-05, + "loss": 0.0005, + "num_tokens": 757944738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3033199624477255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001467727404302518, + "kl": 0.012542724609375, + "learning_rate": 1.759020953940972e-05, + "loss": 0.0005, + "num_tokens": 758508722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30349065460442093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002504227036146712, + "kl": 0.01220703125, + "learning_rate": 1.7586329215123293e-05, + "loss": 0.0005, + "num_tokens": 759073314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30366134676111634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013293361048139887, + "kl": 0.011627197265625, + "learning_rate": 1.758244619796384e-05, + "loss": 0.0005, + "num_tokens": 759637634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30383203891781174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003316993576874526, + "kl": 0.0118255615234375, + "learning_rate": 1.75785604893097e-05, + "loss": 0.0005, + "num_tokens": 760198770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30400273107450715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002883477140423671, + "kl": 0.0125274658203125, + "learning_rate": 1.7574672090540158e-05, + "loss": 0.0005, + "num_tokens": 760763298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3041734232312025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001886591916194947, + "kl": 0.01300048828125, + "learning_rate": 1.757078100303545e-05, + "loss": 0.0005, + "num_tokens": 761330050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3043441153878979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007933235341800024, + "kl": 0.0120849609375, + "learning_rate": 1.756688722817678e-05, + "loss": 0.0005, + "num_tokens": 761888162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3045148075445933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004940038486526154, + "kl": 0.012420654296875, + "learning_rate": 1.7562990767346287e-05, + "loss": 0.0005, + "num_tokens": 762451394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3046854997012887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016379884396034699, + "kl": 0.012664794921875, + "learning_rate": 1.7559091621927085e-05, + "loss": 0.0005, + "num_tokens": 763020018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3048561918579841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005442228974761706, + "kl": 0.0125579833984375, + "learning_rate": 1.7555189793303222e-05, + "loss": 0.0005, + "num_tokens": 763582994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30502688401467953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012562425870245406, + "kl": 0.012451171875, + "learning_rate": 1.7551285282859712e-05, + "loss": 0.0005, + "num_tokens": 764150306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30519757617137494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005194631505126585, + "kl": 0.012176513671875, + "learning_rate": 1.7547378091982507e-05, + "loss": 0.0005, + "num_tokens": 764718930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30536826832807035, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.363045234677723e-05, + "kl": 0.0122528076171875, + "learning_rate": 1.7543468222058528e-05, + "loss": 0.0005, + "num_tokens": 765279106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3055389604847657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011142485859191875, + "kl": 0.0127105712890625, + "learning_rate": 1.7539555674475633e-05, + "loss": 0.0005, + "num_tokens": 765841794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3057096526414611, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004498692287495116, + "kl": 0.0124969482421875, + "learning_rate": 1.7535640450622637e-05, + "loss": 0.0005, + "num_tokens": 766405698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3058803447981565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042342146046956986, + "kl": 0.0128021240234375, + "learning_rate": 1.75317225518893e-05, + "loss": 0.0005, + "num_tokens": 766968866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3060510369548519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023076413374347035, + "kl": 0.0121307373046875, + "learning_rate": 1.7527801979666338e-05, + "loss": 0.0005, + "num_tokens": 767542210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3062217291115473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002801141735543408, + "kl": 0.012542724609375, + "learning_rate": 1.752387873534541e-05, + "loss": 0.0005, + "num_tokens": 768112498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30639242126824273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002982986209023239, + "kl": 0.01226806640625, + "learning_rate": 1.751995282031913e-05, + "loss": 0.0005, + "num_tokens": 768677826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30656311342493814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035396278332312434, + "kl": 0.0124359130859375, + "learning_rate": 1.7516024235981053e-05, + "loss": 0.0005, + "num_tokens": 769238978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30673380558163355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005370609672863025, + "kl": 0.0133819580078125, + "learning_rate": 1.7512092983725684e-05, + "loss": 0.0005, + "num_tokens": 769802402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3069044977383289, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000535332505605325, + "kl": 0.012451171875, + "learning_rate": 1.7508159064948475e-05, + "loss": 0.0005, + "num_tokens": 770368994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3070751898950243, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.348460362217197e-05, + "kl": 0.011688232421875, + "learning_rate": 1.7504222481045828e-05, + "loss": 0.0005, + "num_tokens": 770931698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3072458820517197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014569836664924207, + "kl": 0.01275634765625, + "learning_rate": 1.7500283233415086e-05, + "loss": 0.0005, + "num_tokens": 771493762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3074165742084151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0041368549092171475, + "kl": 0.0126495361328125, + "learning_rate": 1.7496341323454543e-05, + "loss": 0.0005, + "num_tokens": 772068562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3075872663651105, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.589681614016481e-05, + "kl": 0.0123748779296875, + "learning_rate": 1.7492396752563435e-05, + "loss": 0.0005, + "num_tokens": 772633986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30775795852180593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005371700581831916, + "kl": 0.0130615234375, + "learning_rate": 1.7488449522141935e-05, + "loss": 0.0005, + "num_tokens": 773199730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30792865067850134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008544852363941226, + "kl": 0.01202392578125, + "learning_rate": 1.7484499633591174e-05, + "loss": 0.0005, + "num_tokens": 773771586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30809934283519674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016714778982331875, + "kl": 0.0124359130859375, + "learning_rate": 1.748054708831322e-05, + "loss": 0.0005, + "num_tokens": 774338066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30827003499189215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005879347154677695, + "kl": 0.012725830078125, + "learning_rate": 1.7476591887711078e-05, + "loss": 0.0005, + "num_tokens": 774905682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3084407271485875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005165037088927917, + "kl": 0.0126190185546875, + "learning_rate": 1.7472634033188708e-05, + "loss": 0.0005, + "num_tokens": 775473970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3086114193052829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008154176259647495, + "kl": 0.0124969482421875, + "learning_rate": 1.7468673526151002e-05, + "loss": 0.0005, + "num_tokens": 776041842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3087821114619783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022299800491492076, + "kl": 0.012481689453125, + "learning_rate": 1.7464710368003797e-05, + "loss": 0.0005, + "num_tokens": 776607490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3089528036186737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007794923443837628, + "kl": 0.0123138427734375, + "learning_rate": 1.7460744560153873e-05, + "loss": 0.0005, + "num_tokens": 777170946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30912349577536913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000543187967571823, + "kl": 0.012847900390625, + "learning_rate": 1.7456776104008947e-05, + "loss": 0.0005, + "num_tokens": 777739170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30929418793206453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008556213632207276, + "kl": 0.0128021240234375, + "learning_rate": 1.745280500097768e-05, + "loss": 0.0005, + "num_tokens": 778305858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30946488008875994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007010926957868659, + "kl": 0.0122833251953125, + "learning_rate": 1.7448831252469665e-05, + "loss": 0.0005, + "num_tokens": 778868882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30963557224545535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000617759398821506, + "kl": 0.011962890625, + "learning_rate": 1.7444854859895445e-05, + "loss": 0.0005, + "num_tokens": 779437858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3098062644021507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033585006942050955, + "kl": 0.0123443603515625, + "learning_rate": 1.7440875824666496e-05, + "loss": 0.0005, + "num_tokens": 780000786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3099769565588461, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006446267358497091, + "kl": 0.0123138427734375, + "learning_rate": 1.7436894148195227e-05, + "loss": 0.0005, + "num_tokens": 780570450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3101476487155415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048653205576913933, + "kl": 0.0130462646484375, + "learning_rate": 1.743290983189499e-05, + "loss": 0.0005, + "num_tokens": 781131906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3103183408722369, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007836782139388922, + "kl": 0.0122528076171875, + "learning_rate": 1.742892287718008e-05, + "loss": 0.0005, + "num_tokens": 781698242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3104890330289323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000404263331834043, + "kl": 0.01226806640625, + "learning_rate": 1.742493328546571e-05, + "loss": 0.0005, + "num_tokens": 782263714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31065972518562773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005573215052566821, + "kl": 0.0130462646484375, + "learning_rate": 1.7420941058168056e-05, + "loss": 0.0005, + "num_tokens": 782826146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31083041734232314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031225249389571974, + "kl": 0.0127105712890625, + "learning_rate": 1.7416946196704203e-05, + "loss": 0.0005, + "num_tokens": 783391922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31100110949901855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035949371754868416, + "kl": 0.0124359130859375, + "learning_rate": 1.7412948702492184e-05, + "loss": 0.0005, + "num_tokens": 783962018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3111718016557139, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003040418184316393, + "kl": 0.0125579833984375, + "learning_rate": 1.7408948576950973e-05, + "loss": 0.0005, + "num_tokens": 784527378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3113424938124093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003220332235353627, + "kl": 0.013519287109375, + "learning_rate": 1.7404945821500464e-05, + "loss": 0.0005, + "num_tokens": 785091698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3115131859691047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000340859836889691, + "kl": 0.0127410888671875, + "learning_rate": 1.740094043756149e-05, + "loss": 0.0005, + "num_tokens": 785658354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3116838781258001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045376985330541306, + "kl": 0.0124053955078125, + "learning_rate": 1.7396932426555818e-05, + "loss": 0.0005, + "num_tokens": 786218802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3118545702824955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002486671470572253, + "kl": 0.0129241943359375, + "learning_rate": 1.7392921789906153e-05, + "loss": 0.0005, + "num_tokens": 786784098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31202526243919093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003228471031409118, + "kl": 0.0124359130859375, + "learning_rate": 1.7388908529036115e-05, + "loss": 0.0005, + "num_tokens": 787355762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31219595459588634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048766811568970095, + "kl": 0.0124053955078125, + "learning_rate": 1.738489264537028e-05, + "loss": 0.0005, + "num_tokens": 787923298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31236664675258174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004417425603418462, + "kl": 0.0124969482421875, + "learning_rate": 1.7380874140334132e-05, + "loss": 0.0005, + "num_tokens": 788491538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3125373389092771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003985432977853551, + "kl": 0.0118255615234375, + "learning_rate": 1.73768530153541e-05, + "loss": 0.0005, + "num_tokens": 789068178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3127080310659725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000320775878385833, + "kl": 0.0119476318359375, + "learning_rate": 1.7372829271857532e-05, + "loss": 0.0005, + "num_tokens": 789643394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3128787232226679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020116181092555516, + "kl": 0.0124053955078125, + "learning_rate": 1.7368802911272723e-05, + "loss": 0.0005, + "num_tokens": 790209442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3130494153793633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001961188169858946, + "kl": 0.01287841796875, + "learning_rate": 1.7364773935028876e-05, + "loss": 0.0005, + "num_tokens": 790774786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3132201075360587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003734578313653678, + "kl": 0.01202392578125, + "learning_rate": 1.7360742344556133e-05, + "loss": 0.0005, + "num_tokens": 791338818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31339079969275413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011698114249794535, + "kl": 0.0121002197265625, + "learning_rate": 1.7356708141285568e-05, + "loss": 0.0005, + "num_tokens": 791902786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31356149184944954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023449270635161776, + "kl": 0.0118865966796875, + "learning_rate": 1.7352671326649174e-05, + "loss": 0.0005, + "num_tokens": 792469202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31373218400614494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000426641779297248, + "kl": 0.0123748779296875, + "learning_rate": 1.7348631902079877e-05, + "loss": 0.0005, + "num_tokens": 793033970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3139028761628403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000597094836745647, + "kl": 0.0127105712890625, + "learning_rate": 1.734458986901152e-05, + "loss": 0.0005, + "num_tokens": 793598594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3140735683195357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005952419624970999, + "kl": 0.0125274658203125, + "learning_rate": 1.7340545228878888e-05, + "loss": 0.0005, + "num_tokens": 794163586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3142442604762311, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005591459200941353, + "kl": 0.0120086669921875, + "learning_rate": 1.7336497983117678e-05, + "loss": 0.0005, + "num_tokens": 794733970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3144149526329265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003636538581287501, + "kl": 0.0124969482421875, + "learning_rate": 1.7332448133164518e-05, + "loss": 0.0005, + "num_tokens": 795299234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3145856447896219, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044761281525785884, + "kl": 0.01220703125, + "learning_rate": 1.7328395680456953e-05, + "loss": 0.0005, + "num_tokens": 795861714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3147563369463173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031326426572330415, + "kl": 0.0119476318359375, + "learning_rate": 1.732434062643346e-05, + "loss": 0.0005, + "num_tokens": 796438130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31492702910301273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003578353387848541, + "kl": 0.012359619140625, + "learning_rate": 1.7320282972533443e-05, + "loss": 0.0005, + "num_tokens": 797002066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31509772125970814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005147473430156165, + "kl": 0.0120849609375, + "learning_rate": 1.7316222720197216e-05, + "loss": 0.0005, + "num_tokens": 797570562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3152684134164035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001016764198950756, + "kl": 0.012603759765625, + "learning_rate": 1.7312159870866026e-05, + "loss": 0.0005, + "num_tokens": 798131730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3154391055730989, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003067330181263826, + "kl": 0.0124053955078125, + "learning_rate": 1.7308094425982032e-05, + "loss": 0.0005, + "num_tokens": 798697170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3156097977297943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003552226653269004, + "kl": 0.0128021240234375, + "learning_rate": 1.7304026386988327e-05, + "loss": 0.0005, + "num_tokens": 799263490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3157804898864897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002804952836620082, + "kl": 0.012603759765625, + "learning_rate": 1.7299955755328912e-05, + "loss": 0.0005, + "num_tokens": 799824306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3159511820431851, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.747668612405431, + "kl": 0.44970703125, + "learning_rate": 1.729588253244872e-05, + "loss": 0.018, + "num_tokens": 800418258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3161218741998805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002096789970349231, + "kl": 0.0126953125, + "learning_rate": 1.7291806719793595e-05, + "loss": 0.0005, + "num_tokens": 800984418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31629256635657593, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.953774066088306, + "kl": 0.672454833984375, + "learning_rate": 1.72877283188103e-05, + "loss": 0.0269, + "num_tokens": 801552242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31646325851327134, + "frac_reward_zero_std": 1.0, + "grad_norm": 158.67487023040655, + "kl": 26.3125, + "learning_rate": 1.7283647330946527e-05, + "loss": 1.0529, + "num_tokens": 802130402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 426.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 329.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3166339506699667, + "frac_reward_zero_std": 1.0, + "grad_norm": 58.71798801991741, + "kl": 11.234375, + "learning_rate": 1.7279563757650875e-05, + "loss": 0.4496, + "num_tokens": 802282610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 58.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3168046428266621, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.4656373277021713, + "kl": 8.34375, + "learning_rate": 1.7275477600372865e-05, + "loss": 0.334, + "num_tokens": 802342194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3169753349833575, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.288945240674271, + "kl": 9.359375, + "learning_rate": 1.727138886056294e-05, + "loss": 0.3748, + "num_tokens": 802393474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 108.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 83.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3171460271400529, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.7566739366464836, + "kl": 6.0, + "learning_rate": 1.7267297539672445e-05, + "loss": 0.2402, + "num_tokens": 802461250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 411.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 348.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3173167192967483, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7288181562709473, + "kl": 2.6015625, + "learning_rate": 1.7263203639153663e-05, + "loss": 0.1042, + "num_tokens": 802612338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3174874114534437, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.864258705658763, + "kl": 1.943359375, + "learning_rate": 1.7259107160459775e-05, + "loss": 0.0778, + "num_tokens": 803172418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31765810361013913, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7126883113435147, + "kl": 0.522216796875, + "learning_rate": 1.7255008105044884e-05, + "loss": 0.0209, + "num_tokens": 803744018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31782879576683454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21623908430888067, + "kl": 0.1748046875, + "learning_rate": 1.7250906474364008e-05, + "loss": 0.007, + "num_tokens": 804311698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3179994879235299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029208443611395234, + "kl": 0.037078857421875, + "learning_rate": 1.724680226987307e-05, + "loss": 0.0015, + "num_tokens": 804873538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3181701800802253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004205759881576929, + "kl": 0.015625, + "learning_rate": 1.7242695493028927e-05, + "loss": 0.0006, + "num_tokens": 805440930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3183408722369207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031986472682600546, + "kl": 0.0153045654296875, + "learning_rate": 1.7238586145289328e-05, + "loss": 0.0006, + "num_tokens": 806002914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3185115643936161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025110262035501437, + "kl": 0.014678955078125, + "learning_rate": 1.7234474228112947e-05, + "loss": 0.0006, + "num_tokens": 806570370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3186822565503115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022921017026292113, + "kl": 0.0142822265625, + "learning_rate": 1.7230359742959362e-05, + "loss": 0.0006, + "num_tokens": 807136834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3188529487070069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002116257662330757, + "kl": 0.0142822265625, + "learning_rate": 1.722624269128907e-05, + "loss": 0.0006, + "num_tokens": 807701602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31902364086370233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0044592656731176575, + "kl": 0.016387939453125, + "learning_rate": 1.7222123074563475e-05, + "loss": 0.0007, + "num_tokens": 808268930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.31919433302039774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011250499789517723, + "kl": 0.0167236328125, + "learning_rate": 1.7218000894244894e-05, + "loss": 0.0007, + "num_tokens": 808861266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3193650251770931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12958587708621458, + "kl": 0.1207275390625, + "learning_rate": 1.721387615179655e-05, + "loss": 0.0048, + "num_tokens": 809425682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3195357173337885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26249927397547207, + "kl": 0.22216796875, + "learning_rate": 1.7209748848682575e-05, + "loss": 0.0089, + "num_tokens": 809997826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3197064094904839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4151011391404204, + "kl": 0.337890625, + "learning_rate": 1.7205618986368016e-05, + "loss": 0.0135, + "num_tokens": 810563474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1821.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1348.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3198771016471793, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5442280720552781, + "kl": 1.66015625, + "learning_rate": 1.720148656631883e-05, + "loss": 0.0664, + "num_tokens": 811072466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 400.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 192.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3200477938038747, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.816678504057538, + "kl": 2.15625, + "learning_rate": 1.7197351590001865e-05, + "loss": 0.0863, + "num_tokens": 811217090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 134.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3202184859605701, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8597716617830948, + "kl": 2.203125, + "learning_rate": 1.71932140588849e-05, + "loss": 0.0881, + "num_tokens": 811289634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 252.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 159.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3203891781172655, + "frac_reward_zero_std": 1.0, + "grad_norm": 231.9435361273359, + "kl": 32.25, + "learning_rate": 1.71890739744366e-05, + "loss": 1.2892, + "num_tokens": 811396722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 310.625, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 199.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32055987027396093, + "frac_reward_zero_std": 1.0, + "grad_norm": 160.62786382886162, + "kl": 22.40625, + "learning_rate": 1.7184931338126554e-05, + "loss": 0.8942, + "num_tokens": 811514018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 343.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 300.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3207305624306563, + "frac_reward_zero_std": 1.0, + "grad_norm": 15.209026984998173, + "kl": 1.814453125, + "learning_rate": 1.718078615142524e-05, + "loss": 0.0726, + "num_tokens": 811638498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1285.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 905.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 679.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3209012545873517, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.655506128158386, + "kl": 1.626953125, + "learning_rate": 1.717663841580406e-05, + "loss": 0.0652, + "num_tokens": 811913506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3210719467440471, + "frac_reward_zero_std": 1.0, + "grad_norm": 15.399095636718222, + "kl": 1.8125, + "learning_rate": 1.71724881327353e-05, + "loss": 0.0725, + "num_tokens": 812478066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3212426389007425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016876718345556316, + "kl": 0.01226806640625, + "learning_rate": 1.7168335303692163e-05, + "loss": 0.0005, + "num_tokens": 813042946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3214133310574379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017440700345141102, + "kl": 0.012054443359375, + "learning_rate": 1.7164179930148757e-05, + "loss": 0.0005, + "num_tokens": 813615362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3215840232141333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002901727315288039, + "kl": 0.0124664306640625, + "learning_rate": 1.7160022013580088e-05, + "loss": 0.0005, + "num_tokens": 814196546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3217547153708287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020994873648813204, + "kl": 0.0132293701171875, + "learning_rate": 1.7155861555462068e-05, + "loss": 0.0005, + "num_tokens": 814761106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32192540752752413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016927898138418998, + "kl": 0.0130157470703125, + "learning_rate": 1.7151698557271506e-05, + "loss": 0.0005, + "num_tokens": 815325954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3220960996842195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008102446527231309, + "kl": 0.0128173828125, + "learning_rate": 1.7147533020486117e-05, + "loss": 0.0005, + "num_tokens": 815891618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3222667918409149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016796063108148366, + "kl": 0.013092041015625, + "learning_rate": 1.7143364946584517e-05, + "loss": 0.0005, + "num_tokens": 816456418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3224374839976103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009307062016438871, + "kl": 0.0124969482421875, + "learning_rate": 1.7139194337046225e-05, + "loss": 0.0005, + "num_tokens": 817021746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3226081761543057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00768545089414475, + "kl": 0.0133819580078125, + "learning_rate": 1.7135021193351647e-05, + "loss": 0.0005, + "num_tokens": 817585554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3227788683110011, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018031554794259916, + "kl": 0.0123291015625, + "learning_rate": 1.713084551698211e-05, + "loss": 0.0005, + "num_tokens": 818165570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3229495604676965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017385256526428644, + "kl": 0.0125274658203125, + "learning_rate": 1.712666730941982e-05, + "loss": 0.0005, + "num_tokens": 818728338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3231202526243919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032708754647774684, + "kl": 0.012786865234375, + "learning_rate": 1.7122486572147895e-05, + "loss": 0.0005, + "num_tokens": 819292722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32329094478108733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003684774895028626, + "kl": 0.0125274658203125, + "learning_rate": 1.7118303306650342e-05, + "loss": 0.0005, + "num_tokens": 819868786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3234616369377827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003102282986536548, + "kl": 0.012359619140625, + "learning_rate": 1.7114117514412073e-05, + "loss": 0.0005, + "num_tokens": 820430306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3236323290944781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001380963387352718, + "kl": 0.012542724609375, + "learning_rate": 1.710992919691889e-05, + "loss": 0.0005, + "num_tokens": 820992210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3238030212511735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002694877630550058, + "kl": 0.012542724609375, + "learning_rate": 1.7105738355657498e-05, + "loss": 0.0005, + "num_tokens": 821559394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3239737134078689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033149605431673386, + "kl": 0.01361083984375, + "learning_rate": 1.7101544992115487e-05, + "loss": 0.0005, + "num_tokens": 822122786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3241444055645643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001554855857296329, + "kl": 0.0119781494140625, + "learning_rate": 1.7097349107781362e-05, + "loss": 0.0005, + "num_tokens": 822698866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3243150977212597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025840529534143286, + "kl": 0.0125732421875, + "learning_rate": 1.70931507041445e-05, + "loss": 0.0005, + "num_tokens": 823264050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3244857898779551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004365008925664898, + "kl": 0.0131072998046875, + "learning_rate": 1.7088949782695187e-05, + "loss": 0.0005, + "num_tokens": 823832274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32465648203465053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010906156141362578, + "kl": 0.0120849609375, + "learning_rate": 1.7084746344924595e-05, + "loss": 0.0005, + "num_tokens": 824402178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32482717419134594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024683585090182057, + "kl": 0.0125885009765625, + "learning_rate": 1.7080540392324797e-05, + "loss": 0.0005, + "num_tokens": 824969874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3249978663480413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002900328964180331, + "kl": 0.0123291015625, + "learning_rate": 1.7076331926388753e-05, + "loss": 0.0005, + "num_tokens": 825535922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3251685585047367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011165748154665092, + "kl": 0.0126800537109375, + "learning_rate": 1.7072120948610314e-05, + "loss": 0.0005, + "num_tokens": 826103266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3253392506614321, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001299235190066831, + "kl": 0.0127716064453125, + "learning_rate": 1.706790746048423e-05, + "loss": 0.0005, + "num_tokens": 826669970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3255099428181275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022077260741258658, + "kl": 0.012237548828125, + "learning_rate": 1.706369146350613e-05, + "loss": 0.0005, + "num_tokens": 827233666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3256806349748229, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040198788929302846, + "kl": 0.01239013671875, + "learning_rate": 1.705947295917255e-05, + "loss": 0.0005, + "num_tokens": 827798098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3258513271315183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020406134179734857, + "kl": 0.01263427734375, + "learning_rate": 1.70552519489809e-05, + "loss": 0.0005, + "num_tokens": 828361906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3260220192882137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046758873642806085, + "kl": 0.012481689453125, + "learning_rate": 1.7051028434429487e-05, + "loss": 0.0005, + "num_tokens": 828925202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32619271144490913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015572659886440915, + "kl": 0.0129852294921875, + "learning_rate": 1.7046802417017508e-05, + "loss": 0.0005, + "num_tokens": 829489314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3263634036016045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011864451202606113, + "kl": 0.0125274658203125, + "learning_rate": 1.7042573898245045e-05, + "loss": 0.0005, + "num_tokens": 830052786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3265340957582999, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046714862474109715, + "kl": 0.0128021240234375, + "learning_rate": 1.703834287961307e-05, + "loss": 0.0005, + "num_tokens": 830621938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3267047879149953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002121672852903689, + "kl": 0.0125732421875, + "learning_rate": 1.7034109362623446e-05, + "loss": 0.0005, + "num_tokens": 831184210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3268754800716907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005045202540187552, + "kl": 0.012420654296875, + "learning_rate": 1.7029873348778915e-05, + "loss": 0.0005, + "num_tokens": 831748578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3270461722283861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002571555223118908, + "kl": 0.0126800537109375, + "learning_rate": 1.7025634839583104e-05, + "loss": 0.0005, + "num_tokens": 832314146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3272168643850815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003866917264551708, + "kl": 0.0128631591796875, + "learning_rate": 1.7021393836540537e-05, + "loss": 0.0005, + "num_tokens": 832879650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3273875565417769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003016074226902394, + "kl": 0.012939453125, + "learning_rate": 1.701715034115662e-05, + "loss": 0.0005, + "num_tokens": 833441010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32755824869847233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000195483566749452, + "kl": 0.012481689453125, + "learning_rate": 1.7012904354937636e-05, + "loss": 0.0005, + "num_tokens": 834006690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3277289408551677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000155234040134557, + "kl": 0.012664794921875, + "learning_rate": 1.7008655879390753e-05, + "loss": 0.0005, + "num_tokens": 834572322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3278996330118631, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001114440280923435, + "kl": 0.0126800537109375, + "learning_rate": 1.7004404916024032e-05, + "loss": 0.0005, + "num_tokens": 835141922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3280703251685585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032692627296614416, + "kl": 0.012542724609375, + "learning_rate": 1.700015146634641e-05, + "loss": 0.0005, + "num_tokens": 835705138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3282410173252539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037383310942503914, + "kl": 0.012664794921875, + "learning_rate": 1.6995895531867713e-05, + "loss": 0.0005, + "num_tokens": 836270834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3284117094819493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027454687153985203, + "kl": 0.0125732421875, + "learning_rate": 1.6991637114098634e-05, + "loss": 0.0005, + "num_tokens": 836841618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3285824016386447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005019787816041845, + "kl": 0.01239013671875, + "learning_rate": 1.698737621455077e-05, + "loss": 0.0005, + "num_tokens": 837412946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3287530937953401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040164384542113135, + "kl": 0.01348876953125, + "learning_rate": 1.698311283473657e-05, + "loss": 0.0005, + "num_tokens": 837978402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32892378595203553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004596348501900126, + "kl": 0.012725830078125, + "learning_rate": 1.6978846976169395e-05, + "loss": 0.0005, + "num_tokens": 838543986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3290944781087309, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.759937418619681e-05, + "kl": 0.012725830078125, + "learning_rate": 1.6974578640363466e-05, + "loss": 0.0005, + "num_tokens": 839109986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3292651702654263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038195248245940274, + "kl": 0.012115478515625, + "learning_rate": 1.6970307828833887e-05, + "loss": 0.0005, + "num_tokens": 839679442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3294358624221217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017553372217462586, + "kl": 0.012969970703125, + "learning_rate": 1.696603454309664e-05, + "loss": 0.0005, + "num_tokens": 840242610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3296065545788171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031647219945531564, + "kl": 0.0123291015625, + "learning_rate": 1.696175878466859e-05, + "loss": 0.0005, + "num_tokens": 840819810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3297772467355125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004755673037114292, + "kl": 0.013092041015625, + "learning_rate": 1.695748055506748e-05, + "loss": 0.0005, + "num_tokens": 841387138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3299479388922079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003393600470814839, + "kl": 0.0124969482421875, + "learning_rate": 1.6953199855811925e-05, + "loss": 0.0005, + "num_tokens": 841952162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3301186310489033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001108785102842185, + "kl": 0.0121307373046875, + "learning_rate": 1.694891668842141e-05, + "loss": 0.0005, + "num_tokens": 842519378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.33028932320559873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020465900978879583, + "kl": 0.01263427734375, + "learning_rate": 1.694463105441632e-05, + "loss": 0.0005, + "num_tokens": 843082514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3304600153622941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047909394824791697, + "kl": 0.0133514404296875, + "learning_rate": 1.6940342955317882e-05, + "loss": 0.0005, + "num_tokens": 843647730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3306307075189895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008189843650887651, + "kl": 0.012237548828125, + "learning_rate": 1.6936052392648236e-05, + "loss": 0.0005, + "num_tokens": 844214354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3308013996756849, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015665558352017273, + "kl": 0.013214111328125, + "learning_rate": 1.6931759367930364e-05, + "loss": 0.0005, + "num_tokens": 844776930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3309720918323803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004171880558980693, + "kl": 0.0124969482421875, + "learning_rate": 1.692746388268814e-05, + "loss": 0.0005, + "num_tokens": 845349474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3311427839890757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035783257888957797, + "kl": 0.01239013671875, + "learning_rate": 1.6923165938446303e-05, + "loss": 0.0005, + "num_tokens": 845912482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3313134761457711, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030608337160937437, + "kl": 0.0119171142578125, + "learning_rate": 1.6918865536730473e-05, + "loss": 0.0005, + "num_tokens": 846474162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3314841683024665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002327630050657348, + "kl": 0.0124969482421875, + "learning_rate": 1.6914562679067132e-05, + "loss": 0.0005, + "num_tokens": 847064466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3316548604591619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003803251821758859, + "kl": 0.01275634765625, + "learning_rate": 1.691025736698364e-05, + "loss": 0.0005, + "num_tokens": 847631778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3318255526158573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003718830337389476, + "kl": 0.0128936767578125, + "learning_rate": 1.6905949602008228e-05, + "loss": 0.0005, + "num_tokens": 848193730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3319962447725527, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005059713322822888, + "kl": 0.0126190185546875, + "learning_rate": 1.6901639385670002e-05, + "loss": 0.0005, + "num_tokens": 848760210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3321669369292481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024220573725425502, + "kl": 0.0125732421875, + "learning_rate": 1.6897326719498926e-05, + "loss": 0.0005, + "num_tokens": 849326178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3323376290859435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038968287131656726, + "kl": 0.012786865234375, + "learning_rate": 1.6893011605025845e-05, + "loss": 0.0005, + "num_tokens": 849895522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3325083212426389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004565986424725154, + "kl": 0.012786865234375, + "learning_rate": 1.688869404378247e-05, + "loss": 0.0005, + "num_tokens": 850464018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3326790133993343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003622047928051583, + "kl": 0.0126190185546875, + "learning_rate": 1.6884374037301375e-05, + "loss": 0.0005, + "num_tokens": 851028194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3328497055560297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006682957121298156, + "kl": 0.01263427734375, + "learning_rate": 1.6880051587116006e-05, + "loss": 0.0005, + "num_tokens": 851591810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3330203977127251, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004033542316659911, + "kl": 0.0125579833984375, + "learning_rate": 1.6875726694760682e-05, + "loss": 0.0005, + "num_tokens": 852165026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3331910898694205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013319700775173686, + "kl": 0.0130767822265625, + "learning_rate": 1.687139936177058e-05, + "loss": 0.0005, + "num_tokens": 852726482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3333617820261159, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024548269510121947, + "kl": 0.0124359130859375, + "learning_rate": 1.6867069589681748e-05, + "loss": 0.0005, + "num_tokens": 853289330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3335324741828113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036990877899332255, + "kl": 0.012420654296875, + "learning_rate": 1.6862737380031095e-05, + "loss": 0.0005, + "num_tokens": 853852898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3337031663395067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001455726866606546, + "kl": 0.012451171875, + "learning_rate": 1.6858402734356406e-05, + "loss": 0.0005, + "num_tokens": 854421042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3338738584962021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019069218785397366, + "kl": 0.01300048828125, + "learning_rate": 1.685406565419632e-05, + "loss": 0.0005, + "num_tokens": 854987250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3340445506528975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025493917320574476, + "kl": 0.012420654296875, + "learning_rate": 1.6849726141090346e-05, + "loss": 0.0005, + "num_tokens": 855549906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3342152428095929, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002371551111262338, + "kl": 0.0129852294921875, + "learning_rate": 1.684538419657885e-05, + "loss": 0.0005, + "num_tokens": 856112530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3343859349662883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004562774598083291, + "kl": 0.012542724609375, + "learning_rate": 1.684103982220307e-05, + "loss": 0.0005, + "num_tokens": 856676450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3345566271229837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048475920532743417, + "kl": 0.0122528076171875, + "learning_rate": 1.68366930195051e-05, + "loss": 0.0005, + "num_tokens": 857238594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3347273192796791, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046983911799845795, + "kl": 0.0124053955078125, + "learning_rate": 1.68323437900279e-05, + "loss": 0.0005, + "num_tokens": 857803906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3348980114363745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041029530342601123, + "kl": 0.0131988525390625, + "learning_rate": 1.682799213531529e-05, + "loss": 0.0005, + "num_tokens": 858366882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3350687035930699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001772160700775185, + "kl": 0.0131072998046875, + "learning_rate": 1.6823638056911944e-05, + "loss": 0.0005, + "num_tokens": 858928786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3352393957497653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023120286006435013, + "kl": 0.0122833251953125, + "learning_rate": 1.6819281556363405e-05, + "loss": 0.0005, + "num_tokens": 859498834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3354100879064607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016057857038401257, + "kl": 0.012542724609375, + "learning_rate": 1.681492263521608e-05, + "loss": 0.0005, + "num_tokens": 860068722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3355807800631561, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000554133028405435, + "kl": 0.0127105712890625, + "learning_rate": 1.6810561295017223e-05, + "loss": 0.0005, + "num_tokens": 860639666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3357514722198515, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.584252310375109e-05, + "kl": 0.0122833251953125, + "learning_rate": 1.680619753731495e-05, + "loss": 0.0005, + "num_tokens": 861204130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3359221643765469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019561617234736226, + "kl": 0.0124969482421875, + "learning_rate": 1.6801831363658245e-05, + "loss": 0.0005, + "num_tokens": 861770834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3360928565332423, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.380372897133189e-05, + "kl": 0.011810302734375, + "learning_rate": 1.679746277559694e-05, + "loss": 0.0005, + "num_tokens": 862344146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3362635486899377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007108135354349082, + "kl": 0.0126495361328125, + "learning_rate": 1.679309177468172e-05, + "loss": 0.0005, + "num_tokens": 862913874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3364342408466331, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003789413059087967, + "kl": 0.012359619140625, + "learning_rate": 1.6788718362464134e-05, + "loss": 0.0005, + "num_tokens": 863480146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3366049330033285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005071190648553908, + "kl": 0.0137786865234375, + "learning_rate": 1.6784342540496595e-05, + "loss": 0.0006, + "num_tokens": 864049298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3367756251600239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028909904424186144, + "kl": 0.01226806640625, + "learning_rate": 1.677996431033235e-05, + "loss": 0.0005, + "num_tokens": 864613970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3369463173167193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002476266144646108, + "kl": 0.0121612548828125, + "learning_rate": 1.6775583673525518e-05, + "loss": 0.0005, + "num_tokens": 865182930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3371170094734147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013301783355940892, + "kl": 0.012359619140625, + "learning_rate": 1.677120063163107e-05, + "loss": 0.0005, + "num_tokens": 865746882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.33728770163011007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003275175916249813, + "kl": 0.012420654296875, + "learning_rate": 1.6766815186204816e-05, + "loss": 0.0005, + "num_tokens": 866313938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3374583937868055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032728133496686275, + "kl": 0.0128631591796875, + "learning_rate": 1.6762427338803446e-05, + "loss": 0.0005, + "num_tokens": 866879762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3376290859435009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007285610206473983, + "kl": 0.0131988525390625, + "learning_rate": 1.6758037090984477e-05, + "loss": 0.0005, + "num_tokens": 867449666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3377997781001963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001746971892451149, + "kl": 0.012542724609375, + "learning_rate": 1.6753644444306293e-05, + "loss": 0.0005, + "num_tokens": 868015826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3379704702568917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036289798915915546, + "kl": 0.01190185546875, + "learning_rate": 1.6749249400328126e-05, + "loss": 0.0005, + "num_tokens": 868586066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3381411624135871, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039872024503274315, + "kl": 0.0128173828125, + "learning_rate": 1.6744851960610056e-05, + "loss": 0.0005, + "num_tokens": 869155282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3383118545702825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035759203339224123, + "kl": 0.01214599609375, + "learning_rate": 1.674045212671301e-05, + "loss": 0.0005, + "num_tokens": 869722978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3384825467269779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045340557966659455, + "kl": 0.0123748779296875, + "learning_rate": 1.6736049900198777e-05, + "loss": 0.0005, + "num_tokens": 870290162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.33865323888367327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041883234228782276, + "kl": 0.0132904052734375, + "learning_rate": 1.6731645282629992e-05, + "loss": 0.0005, + "num_tokens": 870859490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3388239310403687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045652931860543956, + "kl": 0.0126800537109375, + "learning_rate": 1.6727238275570123e-05, + "loss": 0.0005, + "num_tokens": 871425602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3389946231970641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025898426875421726, + "kl": 0.012725830078125, + "learning_rate": 1.672282888058351e-05, + "loss": 0.0005, + "num_tokens": 871991586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3391653153537595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002891689589195779, + "kl": 0.013214111328125, + "learning_rate": 1.6718417099235323e-05, + "loss": 0.0005, + "num_tokens": 872558530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3393360075104549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038805126257585175, + "kl": 0.01324462890625, + "learning_rate": 1.6714002933091588e-05, + "loss": 0.0005, + "num_tokens": 873122338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3395066996671503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004501418040044133, + "kl": 0.0125732421875, + "learning_rate": 1.6709586383719175e-05, + "loss": 0.0005, + "num_tokens": 873685522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3396773918238457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002919446399344181, + "kl": 0.013336181640625, + "learning_rate": 1.670516745268579e-05, + "loss": 0.0005, + "num_tokens": 874253362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3398480839805411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005977960912684067, + "kl": 0.0128021240234375, + "learning_rate": 1.670074614156001e-05, + "loss": 0.0005, + "num_tokens": 874816594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34001877613723647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004684128124019911, + "kl": 0.0124053955078125, + "learning_rate": 1.6696322451911232e-05, + "loss": 0.0005, + "num_tokens": 875382434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3401894682939319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045114433773546807, + "kl": 0.01177978515625, + "learning_rate": 1.66918963853097e-05, + "loss": 0.0005, + "num_tokens": 875951378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3403601604506273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022490412034855962, + "kl": 0.012847900390625, + "learning_rate": 1.6687467943326525e-05, + "loss": 0.0005, + "num_tokens": 876515362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3405308526073227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029419625589750754, + "kl": 0.0119781494140625, + "learning_rate": 1.6683037127533625e-05, + "loss": 0.0005, + "num_tokens": 877082258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3407015447640181, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000494809993240029, + "kl": 0.0125885009765625, + "learning_rate": 1.6678603939503796e-05, + "loss": 0.0005, + "num_tokens": 877649554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3408722369207135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006923091949818651, + "kl": 0.012420654296875, + "learning_rate": 1.6674168380810647e-05, + "loss": 0.0005, + "num_tokens": 878210802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3410429290774089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005598486892232485, + "kl": 0.0129241943359375, + "learning_rate": 1.6669730453028646e-05, + "loss": 0.0005, + "num_tokens": 878775714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3412136212341043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042908146446100546, + "kl": 0.01312255859375, + "learning_rate": 1.66652901577331e-05, + "loss": 0.0005, + "num_tokens": 879338242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34138431339079967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002514303811614682, + "kl": 0.01300048828125, + "learning_rate": 1.6660847496500153e-05, + "loss": 0.0005, + "num_tokens": 879908866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34155500554749507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003648261361599671, + "kl": 0.012420654296875, + "learning_rate": 1.6656402470906785e-05, + "loss": 0.0005, + "num_tokens": 880475362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3417256977041905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030731970674800936, + "kl": 0.0126495361328125, + "learning_rate": 1.665195508253082e-05, + "loss": 0.0005, + "num_tokens": 881049986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3418963898608859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023870687140603808, + "kl": 0.012451171875, + "learning_rate": 1.6647505332950925e-05, + "loss": 0.0005, + "num_tokens": 881620386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3420670820175813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003398525629960782, + "kl": 0.01251220703125, + "learning_rate": 1.66430532237466e-05, + "loss": 0.0005, + "num_tokens": 882186114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3422377741742767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002775326479595277, + "kl": 0.012115478515625, + "learning_rate": 1.6638598756498178e-05, + "loss": 0.0005, + "num_tokens": 882761394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3424084663309721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005141989122829096, + "kl": 0.01263427734375, + "learning_rate": 1.6634141932786837e-05, + "loss": 0.0005, + "num_tokens": 883322290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3425791584876675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011704131896474856, + "kl": 0.0128631591796875, + "learning_rate": 1.662968275419459e-05, + "loss": 0.0005, + "num_tokens": 883885698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3427498506443629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024735580671677497, + "kl": 0.012542724609375, + "learning_rate": 1.6625221222304284e-05, + "loss": 0.0005, + "num_tokens": 884453346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34292054280105827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003690693977346925, + "kl": 0.0127105712890625, + "learning_rate": 1.66207573386996e-05, + "loss": 0.0005, + "num_tokens": 885015218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3430912349577537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003517534560804292, + "kl": 0.0123443603515625, + "learning_rate": 1.6616291104965056e-05, + "loss": 0.0005, + "num_tokens": 885587586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3432619271144491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048796839818528425, + "kl": 0.0129547119140625, + "learning_rate": 1.6611822522686e-05, + "loss": 0.0005, + "num_tokens": 886152098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3434326192711445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043756274909632014, + "kl": 0.012481689453125, + "learning_rate": 1.6607351593448626e-05, + "loss": 0.0005, + "num_tokens": 886713058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3436033114278399, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016918646002799765, + "kl": 0.012359619140625, + "learning_rate": 1.660287831883995e-05, + "loss": 0.0005, + "num_tokens": 887279490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3437740035845353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032129664624943455, + "kl": 0.012298583984375, + "learning_rate": 1.6598402700447816e-05, + "loss": 0.0005, + "num_tokens": 887840754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3439446957412307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005448394595018686, + "kl": 0.0125579833984375, + "learning_rate": 1.6593924739860917e-05, + "loss": 0.0005, + "num_tokens": 888405570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3441153878979261, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019698749251237672, + "kl": 0.012939453125, + "learning_rate": 1.658944443866876e-05, + "loss": 0.0005, + "num_tokens": 888974098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34428608005462147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000426723621732049, + "kl": 0.0130157470703125, + "learning_rate": 1.658496179846169e-05, + "loss": 0.0005, + "num_tokens": 889542834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3444567722113169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003640938658763716, + "kl": 0.0117645263671875, + "learning_rate": 1.658047682083089e-05, + "loss": 0.0005, + "num_tokens": 890114610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3446274643680123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018911366306362858, + "kl": 0.0124053955078125, + "learning_rate": 1.6575989507368356e-05, + "loss": 0.0005, + "num_tokens": 890681298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3447981565247077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004278543786223208, + "kl": 0.0125885009765625, + "learning_rate": 1.6571499859666928e-05, + "loss": 0.0005, + "num_tokens": 891244914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3449688486814031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015058081501655296, + "kl": 0.0125885009765625, + "learning_rate": 1.6567007879320266e-05, + "loss": 0.0005, + "num_tokens": 891810706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3451395408380985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003327735811638933, + "kl": 0.0119171142578125, + "learning_rate": 1.6562513567922867e-05, + "loss": 0.0005, + "num_tokens": 892374546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3453102329947939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016698376147410862, + "kl": 0.0126953125, + "learning_rate": 1.6558016927070043e-05, + "loss": 0.0005, + "num_tokens": 892939650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3454809251514893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019529100148302456, + "kl": 0.012939453125, + "learning_rate": 1.655351795835794e-05, + "loss": 0.0005, + "num_tokens": 893502242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34565161730818467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011416123664056377, + "kl": 0.012054443359375, + "learning_rate": 1.6549016663383527e-05, + "loss": 0.0005, + "num_tokens": 894078594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3458223094648801, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016705705855786724, + "kl": 0.0128631591796875, + "learning_rate": 1.6544513043744606e-05, + "loss": 0.0005, + "num_tokens": 894640946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3459930016215755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005330329835732067, + "kl": 0.012542724609375, + "learning_rate": 1.6540007101039796e-05, + "loss": 0.0005, + "num_tokens": 895198018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3461636937782709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000252808463583753, + "kl": 0.012664794921875, + "learning_rate": 1.6535498836868548e-05, + "loss": 0.0005, + "num_tokens": 895765186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3463343859349663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036036733948891956, + "kl": 0.012359619140625, + "learning_rate": 1.6530988252831134e-05, + "loss": 0.0005, + "num_tokens": 896326738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3465050780916617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022697640582680225, + "kl": 0.0126190185546875, + "learning_rate": 1.652647535052864e-05, + "loss": 0.0005, + "num_tokens": 896895794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3466757702483571, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033173945232539405, + "kl": 0.0126800537109375, + "learning_rate": 1.6521960131562993e-05, + "loss": 0.0005, + "num_tokens": 897458642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3468464624050525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001504678530918397, + "kl": 0.0123748779296875, + "learning_rate": 1.6517442597536926e-05, + "loss": 0.0005, + "num_tokens": 898023170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34701715456174786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038035329055013847, + "kl": 0.0124053955078125, + "learning_rate": 1.6512922750054003e-05, + "loss": 0.0005, + "num_tokens": 898583746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34718784671844327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036637157344913843, + "kl": 0.0125274658203125, + "learning_rate": 1.650840059071861e-05, + "loss": 0.0005, + "num_tokens": 899162914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3473585388751387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030783611762729275, + "kl": 0.0120697021484375, + "learning_rate": 1.650387612113594e-05, + "loss": 0.0005, + "num_tokens": 899729458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3475292310318341, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.230808587524943e-05, + "kl": 0.01220703125, + "learning_rate": 1.6499349342912033e-05, + "loss": 0.0005, + "num_tokens": 900300610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3476999231885295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002926867305163099, + "kl": 0.01275634765625, + "learning_rate": 1.6494820257653716e-05, + "loss": 0.0005, + "num_tokens": 900858306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3478706153452249, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005333320328202863, + "kl": 0.0124664306640625, + "learning_rate": 1.6490288866968658e-05, + "loss": 0.0005, + "num_tokens": 901426002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3480413075019203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020355999076396509, + "kl": 0.012237548828125, + "learning_rate": 1.6485755172465338e-05, + "loss": 0.0005, + "num_tokens": 901991778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3482119996586157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001592274644496347, + "kl": 0.01263427734375, + "learning_rate": 1.6481219175753054e-05, + "loss": 0.0005, + "num_tokens": 902555170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34838269181531106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000222899424971742, + "kl": 0.0123291015625, + "learning_rate": 1.6476680878441925e-05, + "loss": 0.0005, + "num_tokens": 903115810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34855338397200647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022251912712671745, + "kl": 0.0122833251953125, + "learning_rate": 1.6472140282142874e-05, + "loss": 0.0005, + "num_tokens": 903678450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3487240761287019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025953291840555254, + "kl": 0.012969970703125, + "learning_rate": 1.6467597388467656e-05, + "loss": 0.0005, + "num_tokens": 904242866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3488947682853973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005775808394368556, + "kl": 0.0125274658203125, + "learning_rate": 1.646305219902883e-05, + "loss": 0.0005, + "num_tokens": 904805426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3490654604420927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008051344882323631, + "kl": 0.0120849609375, + "learning_rate": 1.645850471543978e-05, + "loss": 0.0005, + "num_tokens": 905366898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3492361525987881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022911618257182305, + "kl": 0.0120697021484375, + "learning_rate": 1.6453954939314693e-05, + "loss": 0.0005, + "num_tokens": 905934946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3494068447554835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012583427021113977, + "kl": 0.0128021240234375, + "learning_rate": 1.6449402872268574e-05, + "loss": 0.0005, + "num_tokens": 906494546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3495775369121789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001371271027076677, + "kl": 0.0125885009765625, + "learning_rate": 1.6444848515917248e-05, + "loss": 0.0005, + "num_tokens": 907054658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34974822906887426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003044132527535967, + "kl": 0.0122222900390625, + "learning_rate": 1.6440291871877343e-05, + "loss": 0.0005, + "num_tokens": 907625538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34991892122556967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001741020722185471, + "kl": 0.0126190185546875, + "learning_rate": 1.6435732941766308e-05, + "loss": 0.0005, + "num_tokens": 908189570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3500896133822651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001047868198030182, + "kl": 0.0126495361328125, + "learning_rate": 1.6431171727202393e-05, + "loss": 0.0005, + "num_tokens": 908751122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3502603055389605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000358057223004318, + "kl": 0.0124053955078125, + "learning_rate": 1.6426608229804665e-05, + "loss": 0.0005, + "num_tokens": 909319250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3504309976956559, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.544101758258079e-05, + "kl": 0.012359619140625, + "learning_rate": 1.6422042451193005e-05, + "loss": 0.0005, + "num_tokens": 909885874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3506016898523513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014367930113916852, + "kl": 0.0121307373046875, + "learning_rate": 1.6417474392988097e-05, + "loss": 0.0005, + "num_tokens": 910445186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3507723820090467, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.299157025206765e-05, + "kl": 0.012786865234375, + "learning_rate": 1.6412904056811436e-05, + "loss": 0.0005, + "num_tokens": 911009634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3509430741657421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018800316132426542, + "kl": 0.012298583984375, + "learning_rate": 1.640833144428533e-05, + "loss": 0.0005, + "num_tokens": 911571570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35111376632243746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022114979237529203, + "kl": 0.0121002197265625, + "learning_rate": 1.6403756557032885e-05, + "loss": 0.0005, + "num_tokens": 912136002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35128445847913287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003084620231598142, + "kl": 0.0124359130859375, + "learning_rate": 1.6399179396678028e-05, + "loss": 0.0005, + "num_tokens": 912699314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3514551506358283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011487749570281163, + "kl": 0.01263427734375, + "learning_rate": 1.639459996484548e-05, + "loss": 0.0005, + "num_tokens": 913263074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3516258427925237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020042474923260426, + "kl": 0.012847900390625, + "learning_rate": 1.639001826316078e-05, + "loss": 0.0005, + "num_tokens": 913838482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3517965349492191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008005721987898158, + "kl": 0.012939453125, + "learning_rate": 1.6385434293250257e-05, + "loss": 0.0005, + "num_tokens": 914403874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3519672271059145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010443989451168528, + "kl": 0.012176513671875, + "learning_rate": 1.6380848056741062e-05, + "loss": 0.0005, + "num_tokens": 914979426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3521379192626099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015110617332101723, + "kl": 0.012115478515625, + "learning_rate": 1.6376259555261148e-05, + "loss": 0.0005, + "num_tokens": 915546882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3523086114193053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044042042750423016, + "kl": 0.01214599609375, + "learning_rate": 1.6371668790439258e-05, + "loss": 0.0005, + "num_tokens": 916109346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35247930357600066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000824363291993909, + "kl": 0.012237548828125, + "learning_rate": 1.636707576390495e-05, + "loss": 0.0005, + "num_tokens": 916675154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35264999573269606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011294944059245651, + "kl": 0.0126800537109375, + "learning_rate": 1.6362480477288585e-05, + "loss": 0.0005, + "num_tokens": 917239490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35282068788939147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004734214583508285, + "kl": 0.012664794921875, + "learning_rate": 1.6357882932221323e-05, + "loss": 0.0005, + "num_tokens": 917804978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3529913800460869, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022672192976665702, + "kl": 0.01287841796875, + "learning_rate": 1.6353283130335126e-05, + "loss": 0.0005, + "num_tokens": 918365218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3531620722027823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022837615730433476, + "kl": 0.0122222900390625, + "learning_rate": 1.634868107326276e-05, + "loss": 0.0005, + "num_tokens": 918926578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3533327643594777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045105508583553005, + "kl": 0.0129241943359375, + "learning_rate": 1.6344076762637785e-05, + "loss": 0.0005, + "num_tokens": 919497954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3535034565161731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026964570993982714, + "kl": 0.012847900390625, + "learning_rate": 1.633947020009457e-05, + "loss": 0.0005, + "num_tokens": 920066434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3536741486728685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008302668339033496, + "kl": 0.0122833251953125, + "learning_rate": 1.6334861387268276e-05, + "loss": 0.0005, + "num_tokens": 920633714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35384484082956386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003890601960077649, + "kl": 0.0127410888671875, + "learning_rate": 1.633025032579486e-05, + "loss": 0.0005, + "num_tokens": 921200898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35401553298625926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002549147729437656, + "kl": 0.0127410888671875, + "learning_rate": 1.632563701731109e-05, + "loss": 0.0005, + "num_tokens": 921762594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35418622514295467, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.809581710124202e-05, + "kl": 0.012451171875, + "learning_rate": 1.632102146345452e-05, + "loss": 0.0005, + "num_tokens": 922322386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3543569172996501, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.356300179452858e-05, + "kl": 0.0122222900390625, + "learning_rate": 1.6316403665863506e-05, + "loss": 0.0005, + "num_tokens": 922887906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3545276094563455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003863121732756254, + "kl": 0.0126190185546875, + "learning_rate": 1.6311783626177203e-05, + "loss": 0.0005, + "num_tokens": 923450498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3546983016130409, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036916850520548174, + "kl": 0.01251220703125, + "learning_rate": 1.6307161346035553e-05, + "loss": 0.0005, + "num_tokens": 924014754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3548689937697363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012155704870464358, + "kl": 0.0127410888671875, + "learning_rate": 1.63025368270793e-05, + "loss": 0.0005, + "num_tokens": 924578418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3550396859264317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004456459669142671, + "kl": 0.0124053955078125, + "learning_rate": 1.6297910070949986e-05, + "loss": 0.0005, + "num_tokens": 925141602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35521037808312705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009741132785140699, + "kl": 0.012481689453125, + "learning_rate": 1.6293281079289934e-05, + "loss": 0.0005, + "num_tokens": 925706066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35538107023982246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010294791765311071, + "kl": 0.0131378173828125, + "learning_rate": 1.6288649853742274e-05, + "loss": 0.0005, + "num_tokens": 926278274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35555176239651787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012519917218746697, + "kl": 0.012603759765625, + "learning_rate": 1.6284016395950924e-05, + "loss": 0.0005, + "num_tokens": 926842722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3557224545532133, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.336848727186932e-05, + "kl": 0.012725830078125, + "learning_rate": 1.627938070756059e-05, + "loss": 0.0005, + "num_tokens": 927406770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3558931467099087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039596265969189874, + "kl": 0.0127410888671875, + "learning_rate": 1.6274742790216783e-05, + "loss": 0.0005, + "num_tokens": 927980306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3560638388666041, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038403825616415864, + "kl": 0.0126953125, + "learning_rate": 1.627010264556579e-05, + "loss": 0.0005, + "num_tokens": 928547682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3562345310232995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004823793673306745, + "kl": 0.0126800537109375, + "learning_rate": 1.626546027525469e-05, + "loss": 0.0005, + "num_tokens": 929119314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3564052231799949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032256220470373777, + "kl": 0.012786865234375, + "learning_rate": 1.6260815680931368e-05, + "loss": 0.0005, + "num_tokens": 929687602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35657591533669025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003339856116187094, + "kl": 0.01239013671875, + "learning_rate": 1.6256168864244478e-05, + "loss": 0.0005, + "num_tokens": 930260194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35674660749338566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013236661759149, + "kl": 0.0132598876953125, + "learning_rate": 1.6251519826843477e-05, + "loss": 0.0005, + "num_tokens": 930825554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35691729965008107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003440570441589395, + "kl": 0.0129547119140625, + "learning_rate": 1.6246868570378608e-05, + "loss": 0.0005, + "num_tokens": 931392738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3570879918067765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043754304816078106, + "kl": 0.0123443603515625, + "learning_rate": 1.624221509650089e-05, + "loss": 0.0005, + "num_tokens": 931957714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3572586839634719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021989646585149965, + "kl": 0.0125885009765625, + "learning_rate": 1.6237559406862145e-05, + "loss": 0.0005, + "num_tokens": 932527506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3574293761201673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003406474009651155, + "kl": 0.0128936767578125, + "learning_rate": 1.6232901503114976e-05, + "loss": 0.0005, + "num_tokens": 933093538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3576000682768627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00610967212685745, + "kl": 0.013519287109375, + "learning_rate": 1.622824138691277e-05, + "loss": 0.0005, + "num_tokens": 933661954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3577707604335581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048154416367684235, + "kl": 0.0127716064453125, + "learning_rate": 1.6223579059909693e-05, + "loss": 0.0005, + "num_tokens": 934232130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35794145259025345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001911837906353473, + "kl": 0.0121917724609375, + "learning_rate": 1.6218914523760712e-05, + "loss": 0.0005, + "num_tokens": 934798850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35811214474694886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020197486539157616, + "kl": 0.01220703125, + "learning_rate": 1.6214247780121563e-05, + "loss": 0.0005, + "num_tokens": 935369810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35828283690364426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014815569189370633, + "kl": 0.0127105712890625, + "learning_rate": 1.620957883064877e-05, + "loss": 0.0005, + "num_tokens": 935936338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35845352906033967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001778249026210411, + "kl": 0.0122528076171875, + "learning_rate": 1.6204907676999652e-05, + "loss": 0.0005, + "num_tokens": 936505794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3586242212170351, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016556811421769568, + "kl": 0.01318359375, + "learning_rate": 1.620023432083229e-05, + "loss": 0.0005, + "num_tokens": 937069986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3587949133737305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005077190037185889, + "kl": 0.012664794921875, + "learning_rate": 1.619555876380556e-05, + "loss": 0.0005, + "num_tokens": 937634530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3589656055304259, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.604952731874026e-05, + "kl": 0.012176513671875, + "learning_rate": 1.6190881007579114e-05, + "loss": 0.0005, + "num_tokens": 938195906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3591362976871213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030641407257024026, + "kl": 0.012664794921875, + "learning_rate": 1.618620105381339e-05, + "loss": 0.0005, + "num_tokens": 938761954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3593069898438167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034252448881284484, + "kl": 0.0126953125, + "learning_rate": 1.61815189041696e-05, + "loss": 0.0005, + "num_tokens": 939329426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35947768200051206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011045081319104, + "kl": 0.01202392578125, + "learning_rate": 1.617683456030974e-05, + "loss": 0.0005, + "num_tokens": 939896802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35964837415720746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007986875791128704, + "kl": 0.01226806640625, + "learning_rate": 1.617214802389658e-05, + "loss": 0.0005, + "num_tokens": 940476546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35981906631390287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032620836933215956, + "kl": 0.0126495361328125, + "learning_rate": 1.6167459296593674e-05, + "loss": 0.0005, + "num_tokens": 941039730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3599897584705983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000795586675822304, + "kl": 0.012603759765625, + "learning_rate": 1.616276838006535e-05, + "loss": 0.0005, + "num_tokens": 941602002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3601604506272937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047188866231313067, + "kl": 0.0129241943359375, + "learning_rate": 1.6158075275976716e-05, + "loss": 0.0005, + "num_tokens": 942168402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3603311427839891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006343795766568433, + "kl": 0.0124359130859375, + "learning_rate": 1.6153379985993655e-05, + "loss": 0.0005, + "num_tokens": 942729090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3605018349406845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002754779440953787, + "kl": 0.013336181640625, + "learning_rate": 1.6148682511782817e-05, + "loss": 0.0005, + "num_tokens": 943297874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3606725270973799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012729252147889415, + "kl": 0.0125579833984375, + "learning_rate": 1.6143982855011646e-05, + "loss": 0.0005, + "num_tokens": 943857698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36084321925407525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005853113714199285, + "kl": 0.012359619140625, + "learning_rate": 1.6139281017348347e-05, + "loss": 0.0005, + "num_tokens": 944422626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36101391141077066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017467109154324761, + "kl": 0.01275634765625, + "learning_rate": 1.61345770004619e-05, + "loss": 0.0005, + "num_tokens": 944993170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36118460356746607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003793963067739694, + "kl": 0.0128173828125, + "learning_rate": 1.6129870806022066e-05, + "loss": 0.0005, + "num_tokens": 945557090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3613552957241615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001923600799754829, + "kl": 0.012359619140625, + "learning_rate": 1.6125162435699368e-05, + "loss": 0.0005, + "num_tokens": 946117986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3615259878808569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013333231867321206, + "kl": 0.012969970703125, + "learning_rate": 1.612045189116511e-05, + "loss": 0.0005, + "num_tokens": 946677586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3616966800375523, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.051099287353031e-05, + "kl": 0.0128631591796875, + "learning_rate": 1.6115739174091372e-05, + "loss": 0.0005, + "num_tokens": 947241506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3618673721942477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004823280351278989, + "kl": 0.01361083984375, + "learning_rate": 1.611102428615099e-05, + "loss": 0.0005, + "num_tokens": 947807042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3620380643509431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005575068749156692, + "kl": 0.012481689453125, + "learning_rate": 1.610630722901758e-05, + "loss": 0.0005, + "num_tokens": 948379218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36220875650763845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001964269365013376, + "kl": 0.0123748779296875, + "learning_rate": 1.6101588004365532e-05, + "loss": 0.0005, + "num_tokens": 948952114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36237944866433386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002457709634383021, + "kl": 0.013031005859375, + "learning_rate": 1.6096866613869993e-05, + "loss": 0.0005, + "num_tokens": 949513666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36255014082102927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041282340149342275, + "kl": 0.0120086669921875, + "learning_rate": 1.6092143059206894e-05, + "loss": 0.0005, + "num_tokens": 950080114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36272083297772467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005185589135669682, + "kl": 0.01300048828125, + "learning_rate": 1.6087417342052922e-05, + "loss": 0.0005, + "num_tokens": 950641858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3628915251344201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047651876554529767, + "kl": 0.0123291015625, + "learning_rate": 1.6082689464085538e-05, + "loss": 0.0005, + "num_tokens": 951202658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3630622172911155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001330645501961615, + "kl": 0.012481689453125, + "learning_rate": 1.607795942698296e-05, + "loss": 0.0005, + "num_tokens": 951771186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3632329094478109, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029631878915897445, + "kl": 0.0125579833984375, + "learning_rate": 1.6073227232424192e-05, + "loss": 0.0005, + "num_tokens": 952336514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3634036016045063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001729084590550947, + "kl": 0.0128936767578125, + "learning_rate": 1.606849288208899e-05, + "loss": 0.0005, + "num_tokens": 952903826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36357429376120165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022556927630548642, + "kl": 0.013031005859375, + "learning_rate": 1.6063756377657873e-05, + "loss": 0.0005, + "num_tokens": 953464850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36374498591789706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005446871768690218, + "kl": 0.012603759765625, + "learning_rate": 1.6059017720812126e-05, + "loss": 0.0005, + "num_tokens": 954028034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36391567807459246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003617361120629172, + "kl": 0.0124664306640625, + "learning_rate": 1.6054276913233814e-05, + "loss": 0.0005, + "num_tokens": 954590738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36408637023128787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019309282996887115, + "kl": 0.0125885009765625, + "learning_rate": 1.604953395660574e-05, + "loss": 0.0005, + "num_tokens": 955153746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3642570623879833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005718832886826565, + "kl": 0.0130462646484375, + "learning_rate": 1.604478885261149e-05, + "loss": 0.0005, + "num_tokens": 955725154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3644277545446787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001924524301099238, + "kl": 0.0129852294921875, + "learning_rate": 1.6040041602935397e-05, + "loss": 0.0005, + "num_tokens": 956286002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3645984467013741, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.17531918517228e-05, + "kl": 0.0118865966796875, + "learning_rate": 1.603529220926257e-05, + "loss": 0.0005, + "num_tokens": 956856258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3647691388580695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027350874172783524, + "kl": 0.011962890625, + "learning_rate": 1.603054067327887e-05, + "loss": 0.0005, + "num_tokens": 957430018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36493983101476485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005908295913427057, + "kl": 0.0127105712890625, + "learning_rate": 1.6025786996670923e-05, + "loss": 0.0005, + "num_tokens": 957991650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36511052317146026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018593880663657039, + "kl": 0.012451171875, + "learning_rate": 1.602103118112611e-05, + "loss": 0.0005, + "num_tokens": 958556914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36528121532815566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007876314761310234, + "kl": 0.012939453125, + "learning_rate": 1.601627322833257e-05, + "loss": 0.0005, + "num_tokens": 959124338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36545190748485107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003453932680208595, + "kl": 0.012359619140625, + "learning_rate": 1.6011513139979214e-05, + "loss": 0.0005, + "num_tokens": 959696098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3656225996415465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004020892490944818, + "kl": 0.0123748779296875, + "learning_rate": 1.60067509177557e-05, + "loss": 0.0005, + "num_tokens": 960263058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3657932917982419, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003595652039090701, + "kl": 0.0126190185546875, + "learning_rate": 1.6001986563352433e-05, + "loss": 0.0005, + "num_tokens": 960830674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3659639839549373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019363807130016447, + "kl": 0.01226806640625, + "learning_rate": 1.59972200784606e-05, + "loss": 0.0005, + "num_tokens": 961394994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3661346761116327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007033640693715698, + "kl": 0.0129547119140625, + "learning_rate": 1.5992451464772124e-05, + "loss": 0.0005, + "num_tokens": 961971426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36630536826832805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001937263118128344, + "kl": 0.0118560791015625, + "learning_rate": 1.59876807239797e-05, + "loss": 0.0005, + "num_tokens": 962533522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36647606042502345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003226539004288193, + "kl": 0.0125274658203125, + "learning_rate": 1.5982907857776752e-05, + "loss": 0.0005, + "num_tokens": 963094978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36664675258171886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005124914175291438, + "kl": 0.0131683349609375, + "learning_rate": 1.597813286785749e-05, + "loss": 0.0005, + "num_tokens": 963658898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36681744473841427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007173357155805647, + "kl": 0.012603759765625, + "learning_rate": 1.5973355755916856e-05, + "loss": 0.0005, + "num_tokens": 964222802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3669881368951097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000585110703025934, + "kl": 0.0122833251953125, + "learning_rate": 1.5968576523650556e-05, + "loss": 0.0005, + "num_tokens": 964790050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3671588290518051, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004153807555216429, + "kl": 0.0128936767578125, + "learning_rate": 1.5963795172755044e-05, + "loss": 0.0005, + "num_tokens": 965360914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3673295212085005, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042200816660815325, + "kl": 0.0130767822265625, + "learning_rate": 1.5959011704927525e-05, + "loss": 0.0005, + "num_tokens": 965924322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3675002133651959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000162183722758653, + "kl": 0.0120697021484375, + "learning_rate": 1.595422612186596e-05, + "loss": 0.0005, + "num_tokens": 966482226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36767090552189124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048798022443098085, + "kl": 0.0123748779296875, + "learning_rate": 1.5949438425269053e-05, + "loss": 0.0005, + "num_tokens": 967046034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36784159767858665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008868136359912536, + "kl": 0.0127716064453125, + "learning_rate": 1.594464861683627e-05, + "loss": 0.0005, + "num_tokens": 967608050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36801228983528206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020417784678207804, + "kl": 0.01251220703125, + "learning_rate": 1.593985669826782e-05, + "loss": 0.0005, + "num_tokens": 968167570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36818298199197746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048451840396346486, + "kl": 0.0125579833984375, + "learning_rate": 1.5935062671264655e-05, + "loss": 0.0005, + "num_tokens": 968743442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36835367414867287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000567369990831073, + "kl": 0.0125579833984375, + "learning_rate": 1.593026653752849e-05, + "loss": 0.0005, + "num_tokens": 969306370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3685243663053683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006084764988486312, + "kl": 0.0126190185546875, + "learning_rate": 1.592546829876177e-05, + "loss": 0.0005, + "num_tokens": 969871810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3686950584620637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046796271493864007, + "kl": 0.012481689453125, + "learning_rate": 1.5920667956667703e-05, + "loss": 0.0005, + "num_tokens": 970442162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3688657506187591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007385713029519288, + "kl": 0.0128631591796875, + "learning_rate": 1.5915865512950236e-05, + "loss": 0.0005, + "num_tokens": 971012274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36903644277545444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019779111455097152, + "kl": 0.012481689453125, + "learning_rate": 1.591106096931406e-05, + "loss": 0.0005, + "num_tokens": 971573858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36920713493214985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001357962042369821, + "kl": 0.0124053955078125, + "learning_rate": 1.5906254327464616e-05, + "loss": 0.0005, + "num_tokens": 972134834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36937782708884526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004781679838327271, + "kl": 0.0115814208984375, + "learning_rate": 1.5901445589108094e-05, + "loss": 0.0005, + "num_tokens": 972698834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36954851924554066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006105866397189061, + "kl": 0.0124664306640625, + "learning_rate": 1.5896634755951416e-05, + "loss": 0.0005, + "num_tokens": 973261138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36971921140223607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006939586739345534, + "kl": 0.0120849609375, + "learning_rate": 1.5891821829702253e-05, + "loss": 0.0005, + "num_tokens": 973831682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3698899035589315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002456070091595199, + "kl": 0.012603759765625, + "learning_rate": 1.5887006812069025e-05, + "loss": 0.0005, + "num_tokens": 974390850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3700605957156269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008702197095395934, + "kl": 0.012115478515625, + "learning_rate": 1.5882189704760887e-05, + "loss": 0.0005, + "num_tokens": 974957330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3702312878723223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041525301070944915, + "kl": 0.01263427734375, + "learning_rate": 1.587737050948774e-05, + "loss": 0.0005, + "num_tokens": 975524978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37040198002901764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003785755257252451, + "kl": 0.0125732421875, + "learning_rate": 1.587254922796022e-05, + "loss": 0.0005, + "num_tokens": 976089586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37057267218571305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005188778067246569, + "kl": 0.0129241943359375, + "learning_rate": 1.5867725861889714e-05, + "loss": 0.0005, + "num_tokens": 976654610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37074336434240845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007627289140536419, + "kl": 0.0128173828125, + "learning_rate": 1.586290041298834e-05, + "loss": 0.0005, + "num_tokens": 977221954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37091405649910386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005569332153839498, + "kl": 0.0130615234375, + "learning_rate": 1.5858072882968957e-05, + "loss": 0.0005, + "num_tokens": 977790402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37108474865579927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003449922652820331, + "kl": 0.01275634765625, + "learning_rate": 1.5853243273545164e-05, + "loss": 0.0005, + "num_tokens": 978356306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3712554408124947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002999118166785213, + "kl": 0.0129547119140625, + "learning_rate": 1.58484115864313e-05, + "loss": 0.0005, + "num_tokens": 978917538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3714261329691901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046527468729231096, + "kl": 0.01214599609375, + "learning_rate": 1.584357782334244e-05, + "loss": 0.0005, + "num_tokens": 979485682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3715968251258855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006223381917579818, + "kl": 0.0120849609375, + "learning_rate": 1.583874198599439e-05, + "loss": 0.0005, + "num_tokens": 980058930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37176751728258084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004717931853368759, + "kl": 0.012542724609375, + "learning_rate": 1.5833904076103703e-05, + "loss": 0.0005, + "num_tokens": 980620306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37193820943927625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001260076790368451, + "kl": 0.012451171875, + "learning_rate": 1.5829064095387662e-05, + "loss": 0.0005, + "num_tokens": 981185586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37210890159597165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036174695141597067, + "kl": 0.011932373046875, + "learning_rate": 1.582422204556428e-05, + "loss": 0.0005, + "num_tokens": 981748818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37227959375266706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005594665852537144, + "kl": 0.012054443359375, + "learning_rate": 1.581937792835232e-05, + "loss": 0.0005, + "num_tokens": 982317794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37245028590936247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045162912027000467, + "kl": 0.012420654296875, + "learning_rate": 1.581453174547126e-05, + "loss": 0.0005, + "num_tokens": 982884434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3726209780660579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030431676579700947, + "kl": 0.012664794921875, + "learning_rate": 1.5809683498641322e-05, + "loss": 0.0005, + "num_tokens": 983463938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3727916702227533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011892397764098535, + "kl": 0.0123138427734375, + "learning_rate": 1.580483318958346e-05, + "loss": 0.0005, + "num_tokens": 984032642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3729623623794487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003823778968341519, + "kl": 0.0126800537109375, + "learning_rate": 1.579998082001936e-05, + "loss": 0.0005, + "num_tokens": 984597986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37313305453614404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024022672216807668, + "kl": 0.0125885009765625, + "learning_rate": 1.5795126391671435e-05, + "loss": 0.0005, + "num_tokens": 985160274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37330374669283944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002978953317488757, + "kl": 0.0132904052734375, + "learning_rate": 1.5790269906262833e-05, + "loss": 0.0005, + "num_tokens": 985725922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37347443884953485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033073984693388905, + "kl": 0.01220703125, + "learning_rate": 1.578541136551743e-05, + "loss": 0.0005, + "num_tokens": 986291970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37364513100623026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003485317176972218, + "kl": 0.0120086669921875, + "learning_rate": 1.5780550771159835e-05, + "loss": 0.0005, + "num_tokens": 986856530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37381582316292566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002975573733065994, + "kl": 0.012603759765625, + "learning_rate": 1.5775688124915384e-05, + "loss": 0.0005, + "num_tokens": 987424098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37398651531962107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031423949960860575, + "kl": 0.0123138427734375, + "learning_rate": 1.5770823428510134e-05, + "loss": 0.0005, + "num_tokens": 987989666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3741572074763165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030434059904938204, + "kl": 0.0126495361328125, + "learning_rate": 1.5765956683670887e-05, + "loss": 0.0005, + "num_tokens": 988556098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3743278996330119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043430248065116, + "kl": 0.01220703125, + "learning_rate": 1.576108789212515e-05, + "loss": 0.0005, + "num_tokens": 989113154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37449859178970724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033457814050826315, + "kl": 0.01263427734375, + "learning_rate": 1.575621705560118e-05, + "loss": 0.0005, + "num_tokens": 989677778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37466928394640264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000850364837798258, + "kl": 0.0124664306640625, + "learning_rate": 1.575134417582794e-05, + "loss": 0.0005, + "num_tokens": 990242146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37483997610309805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003705393689774199, + "kl": 0.0119781494140625, + "learning_rate": 1.574646925453513e-05, + "loss": 0.0005, + "num_tokens": 990808834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37501066825979346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002513951129702735, + "kl": 0.012542724609375, + "learning_rate": 1.5741592293453175e-05, + "loss": 0.0005, + "num_tokens": 991371570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37518136041648886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001174112683220264, + "kl": 0.0127105712890625, + "learning_rate": 1.573671329431321e-05, + "loss": 0.0005, + "num_tokens": 991944898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37535205257318427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023640758383724843, + "kl": 0.012939453125, + "learning_rate": 1.5731832258847107e-05, + "loss": 0.0005, + "num_tokens": 992510418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3755227447298797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002610112647777215, + "kl": 0.0127105712890625, + "learning_rate": 1.5726949188787463e-05, + "loss": 0.0005, + "num_tokens": 993076706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3756934368865751, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008876560741284182, + "kl": 0.013092041015625, + "learning_rate": 1.5722064085867587e-05, + "loss": 0.0005, + "num_tokens": 993642610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37586412904327043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003921706462130824, + "kl": 0.0126800537109375, + "learning_rate": 1.5717176951821514e-05, + "loss": 0.0005, + "num_tokens": 994205474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37603482119996584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005234728466541717, + "kl": 0.0125885009765625, + "learning_rate": 1.5712287788384006e-05, + "loss": 0.0005, + "num_tokens": 994773042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37620551335666125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007265755620651919, + "kl": 0.0124664306640625, + "learning_rate": 1.570739659729053e-05, + "loss": 0.0005, + "num_tokens": 995339762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37637620551335665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003924168228371354, + "kl": 0.0129241943359375, + "learning_rate": 1.570250338027729e-05, + "loss": 0.0005, + "num_tokens": 995900418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37654689767005206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018793875107434463, + "kl": 0.0125732421875, + "learning_rate": 1.5697608139081196e-05, + "loss": 0.0005, + "num_tokens": 996466722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37671758982674747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001334724460538072, + "kl": 0.0125732421875, + "learning_rate": 1.5692710875439888e-05, + "loss": 0.0005, + "num_tokens": 997037298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3768882819834429, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029310236465595257, + "kl": 0.01226806640625, + "learning_rate": 1.5687811591091713e-05, + "loss": 0.0005, + "num_tokens": 997600498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3770589741401383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037807831910491915, + "kl": 0.012298583984375, + "learning_rate": 1.568291028777574e-05, + "loss": 0.0005, + "num_tokens": 998159378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3772296662968337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030100132545876146, + "kl": 0.011871337890625, + "learning_rate": 1.5678006967231755e-05, + "loss": 0.0005, + "num_tokens": 998726210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37740035845352904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046970689314747017, + "kl": 0.0126495361328125, + "learning_rate": 1.5673101631200262e-05, + "loss": 0.0005, + "num_tokens": 999290546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37757105061022445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034101788644005384, + "kl": 0.0125732421875, + "learning_rate": 1.566819428142248e-05, + "loss": 0.0005, + "num_tokens": 999851218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37774174276691985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040007073941744056, + "kl": 0.012359619140625, + "learning_rate": 1.5663284919640336e-05, + "loss": 0.0005, + "num_tokens": 1000425714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37791243492361526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014296062835356666, + "kl": 0.011871337890625, + "learning_rate": 1.565837354759648e-05, + "loss": 0.0005, + "num_tokens": 1000995698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37808312708031067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005173828254696436, + "kl": 0.0123748779296875, + "learning_rate": 1.5653460167034266e-05, + "loss": 0.0005, + "num_tokens": 1001556690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3782538192370061, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003341519129387645, + "kl": 0.0125579833984375, + "learning_rate": 1.5648544779697773e-05, + "loss": 0.0005, + "num_tokens": 1002125058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3784245113937015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034737030466355627, + "kl": 0.0126953125, + "learning_rate": 1.5643627387331783e-05, + "loss": 0.0005, + "num_tokens": 1002690498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3785952035503969, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006806624137087991, + "kl": 0.0135345458984375, + "learning_rate": 1.5638707991681796e-05, + "loss": 0.0005, + "num_tokens": 1003255170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37876589570709224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000813794878677852, + "kl": 0.0131072998046875, + "learning_rate": 1.5633786594494015e-05, + "loss": 0.0005, + "num_tokens": 1003821250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37893658786378764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017827630870062379, + "kl": 0.0129241943359375, + "learning_rate": 1.5628863197515365e-05, + "loss": 0.0005, + "num_tokens": 1004389490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37910728002048305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001508133857605258, + "kl": 0.0125274658203125, + "learning_rate": 1.5623937802493467e-05, + "loss": 0.0005, + "num_tokens": 1004954290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37927797217717846, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.65219350126766e-05, + "kl": 0.0123443603515625, + "learning_rate": 1.5619010411176662e-05, + "loss": 0.0005, + "num_tokens": 1005521490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37944866433387386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014296368221186016, + "kl": 0.0127105712890625, + "learning_rate": 1.5614081025313998e-05, + "loss": 0.0005, + "num_tokens": 1006086274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37961935649056927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021486244463729271, + "kl": 0.0127105712890625, + "learning_rate": 1.5609149646655225e-05, + "loss": 0.0005, + "num_tokens": 1006651970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3797900486472647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012724594842187886, + "kl": 0.0131072998046875, + "learning_rate": 1.5604216276950804e-05, + "loss": 0.0005, + "num_tokens": 1007217170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3799607408039601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003745513339190464, + "kl": 0.0123748779296875, + "learning_rate": 1.5599280917951906e-05, + "loss": 0.0005, + "num_tokens": 1007783170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38013143296065544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004605993492410621, + "kl": 0.0123748779296875, + "learning_rate": 1.5594343571410407e-05, + "loss": 0.0005, + "num_tokens": 1008345458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38030212511735084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013668988338825077, + "kl": 0.012298583984375, + "learning_rate": 1.5589404239078882e-05, + "loss": 0.0005, + "num_tokens": 1008904626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38047281727404625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023018800514871965, + "kl": 0.0125732421875, + "learning_rate": 1.558446292271062e-05, + "loss": 0.0005, + "num_tokens": 1009470690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38064350943074166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023460227674023036, + "kl": 0.01336669921875, + "learning_rate": 1.5579519624059605e-05, + "loss": 0.0005, + "num_tokens": 1010035458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38081420158743706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002970691820374743, + "kl": 0.012054443359375, + "learning_rate": 1.5574574344880534e-05, + "loss": 0.0005, + "num_tokens": 1010601906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38098489374413247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018871526811280341, + "kl": 0.0122833251953125, + "learning_rate": 1.5569627086928804e-05, + "loss": 0.0005, + "num_tokens": 1011164066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3811555859008279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005326429008264381, + "kl": 0.0128021240234375, + "learning_rate": 1.5564677851960504e-05, + "loss": 0.0005, + "num_tokens": 1011734818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3813262780575233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004540129332528353, + "kl": 0.012603759765625, + "learning_rate": 1.5559726641732446e-05, + "loss": 0.0005, + "num_tokens": 1012306722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38149697021421863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010247089546365273, + "kl": 0.0123443603515625, + "learning_rate": 1.5554773458002124e-05, + "loss": 0.0005, + "num_tokens": 1012869762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38166766237091404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001542733976368728, + "kl": 0.0127410888671875, + "learning_rate": 1.5549818302527738e-05, + "loss": 0.0005, + "num_tokens": 1013434098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38183835452760945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000279566255866413, + "kl": 0.012603759765625, + "learning_rate": 1.5544861177068193e-05, + "loss": 0.0005, + "num_tokens": 1014000322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38200904668430485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019933210307377405, + "kl": 0.012420654296875, + "learning_rate": 1.5539902083383088e-05, + "loss": 0.0005, + "num_tokens": 1014562658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38217973884100026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028989282389870207, + "kl": 0.0127105712890625, + "learning_rate": 1.5534941023232723e-05, + "loss": 0.0005, + "num_tokens": 1015128114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38235043099769567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017479873385582386, + "kl": 0.012908935546875, + "learning_rate": 1.5529977998378093e-05, + "loss": 0.0005, + "num_tokens": 1015693346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3825211231543911, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003246228661740102, + "kl": 0.0122833251953125, + "learning_rate": 1.5525013010580895e-05, + "loss": 0.0005, + "num_tokens": 1016260578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3826918153110865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002962448778601606, + "kl": 0.01165771484375, + "learning_rate": 1.552004606160352e-05, + "loss": 0.0005, + "num_tokens": 1016829330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38286250746778183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002658741425767036, + "kl": 0.012237548828125, + "learning_rate": 1.5515077153209053e-05, + "loss": 0.0005, + "num_tokens": 1017390194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38303319962447724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036296996397204595, + "kl": 0.012481689453125, + "learning_rate": 1.551010628716128e-05, + "loss": 0.0005, + "num_tokens": 1017954754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38320389178117265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006715567099969298, + "kl": 0.0129852294921875, + "learning_rate": 1.5505133465224683e-05, + "loss": 0.0005, + "num_tokens": 1018520578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38337458393786805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012819836199002354, + "kl": 0.012451171875, + "learning_rate": 1.5500158689164427e-05, + "loss": 0.0005, + "num_tokens": 1019087874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38354527609456346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004243076642218482, + "kl": 0.0124664306640625, + "learning_rate": 1.5495181960746377e-05, + "loss": 0.0005, + "num_tokens": 1019649538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38371596825125887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001400005252882339, + "kl": 0.012481689453125, + "learning_rate": 1.54902032817371e-05, + "loss": 0.0005, + "num_tokens": 1020212258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38388666040795427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003499156638109042, + "kl": 0.0124359130859375, + "learning_rate": 1.548522265390384e-05, + "loss": 0.0005, + "num_tokens": 1020777586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3840573525646497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010981853770676761, + "kl": 0.0130615234375, + "learning_rate": 1.5480240079014544e-05, + "loss": 0.0005, + "num_tokens": 1021344034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38422804472134503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017031536268887105, + "kl": 0.0122222900390625, + "learning_rate": 1.547525555883785e-05, + "loss": 0.0005, + "num_tokens": 1021903186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38439873687804044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005224352909640622, + "kl": 0.0124969482421875, + "learning_rate": 1.5470269095143072e-05, + "loss": 0.0005, + "num_tokens": 1022467634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38456942903473584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008638264439615932, + "kl": 0.0126190185546875, + "learning_rate": 1.5465280689700234e-05, + "loss": 0.0005, + "num_tokens": 1023034898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38474012119143125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006916930603662205, + "kl": 0.012969970703125, + "learning_rate": 1.5460290344280036e-05, + "loss": 0.0005, + "num_tokens": 1023603730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38491081334812666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022250096424467357, + "kl": 0.01263427734375, + "learning_rate": 1.5455298060653866e-05, + "loss": 0.0005, + "num_tokens": 1024171650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38508150550482206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028534855977590374, + "kl": 0.0124664306640625, + "learning_rate": 1.5450303840593815e-05, + "loss": 0.0005, + "num_tokens": 1024734738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38525219766151747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044134141364814063, + "kl": 0.011932373046875, + "learning_rate": 1.5445307685872646e-05, + "loss": 0.0005, + "num_tokens": 1025298066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3854228898182129, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027795043284338267, + "kl": 0.01226806640625, + "learning_rate": 1.544030959826381e-05, + "loss": 0.0005, + "num_tokens": 1025858082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38559358197490823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024315716030045445, + "kl": 0.0121917724609375, + "learning_rate": 1.543530957954145e-05, + "loss": 0.0005, + "num_tokens": 1026429714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38576427413160363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022257606283876286, + "kl": 0.0123443603515625, + "learning_rate": 1.5430307631480395e-05, + "loss": 0.0005, + "num_tokens": 1026999602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38593496628829904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019515534988492948, + "kl": 0.0125885009765625, + "learning_rate": 1.542530375585615e-05, + "loss": 0.0005, + "num_tokens": 1027561026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38610565844499445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014404598113812147, + "kl": 0.0125274658203125, + "learning_rate": 1.5420297954444918e-05, + "loss": 0.0005, + "num_tokens": 1028130034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38627635060168986, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.77583262929657e-05, + "kl": 0.0132598876953125, + "learning_rate": 1.541529022902357e-05, + "loss": 0.0005, + "num_tokens": 1028696914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38644704275838526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003264434758482446, + "kl": 0.012237548828125, + "learning_rate": 1.5410280581369675e-05, + "loss": 0.0005, + "num_tokens": 1029260386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38661773491508067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030950489073431036, + "kl": 0.012054443359375, + "learning_rate": 1.540526901326147e-05, + "loss": 0.0005, + "num_tokens": 1029828482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3867884270717761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002701493807842615, + "kl": 0.011993408203125, + "learning_rate": 1.540025552647789e-05, + "loss": 0.0005, + "num_tokens": 1030401506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3869591192284714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002488121873170291, + "kl": 0.012481689453125, + "learning_rate": 1.5395240122798528e-05, + "loss": 0.0005, + "num_tokens": 1030964850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38712981138516683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017768351325236605, + "kl": 0.0126495361328125, + "learning_rate": 1.5390222804003688e-05, + "loss": 0.0005, + "num_tokens": 1031535122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38730050354186224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045129039039947845, + "kl": 0.0127410888671875, + "learning_rate": 1.5385203571874323e-05, + "loss": 0.0005, + "num_tokens": 1032106690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38747119569855765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005323883108716617, + "kl": 0.0126190185546875, + "learning_rate": 1.5380182428192085e-05, + "loss": 0.0005, + "num_tokens": 1032673986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38764188785525305, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.669271051887287e-05, + "kl": 0.01226806640625, + "learning_rate": 1.5375159374739298e-05, + "loss": 0.0005, + "num_tokens": 1033245922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38781258001194846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001908507656238363, + "kl": 0.012847900390625, + "learning_rate": 1.537013441329897e-05, + "loss": 0.0005, + "num_tokens": 1033811746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38798327216864387, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.192837590544394e-05, + "kl": 0.0123138427734375, + "learning_rate": 1.536510754565477e-05, + "loss": 0.0005, + "num_tokens": 1034378130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3881539643253393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022552026197679716, + "kl": 0.01226806640625, + "learning_rate": 1.5360078773591066e-05, + "loss": 0.0005, + "num_tokens": 1034945778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3883246564820346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006919855138393226, + "kl": 0.012451171875, + "learning_rate": 1.535504809889288e-05, + "loss": 0.0005, + "num_tokens": 1035515474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38849534863873003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004534550692226751, + "kl": 0.012237548828125, + "learning_rate": 1.535001552334593e-05, + "loss": 0.0005, + "num_tokens": 1036075906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38866604079542544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0192269887868806, + "kl": 0.01434326171875, + "learning_rate": 1.534498104873659e-05, + "loss": 0.0006, + "num_tokens": 1036638962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38883673295212084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005739226553007667, + "kl": 0.0120391845703125, + "learning_rate": 1.533994467685192e-05, + "loss": 0.0005, + "num_tokens": 1037200642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38900742510881625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020078564662936576, + "kl": 0.0126190185546875, + "learning_rate": 1.533490640947965e-05, + "loss": 0.0005, + "num_tokens": 1037768610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38917811726551166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033591996366524597, + "kl": 0.0123291015625, + "learning_rate": 1.5329866248408184e-05, + "loss": 0.0005, + "num_tokens": 1038332786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38934880942220707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035246348695973443, + "kl": 0.0124664306640625, + "learning_rate": 1.5324824195426603e-05, + "loss": 0.0005, + "num_tokens": 1038901522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38951950157890247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005108971161071126, + "kl": 0.013397216796875, + "learning_rate": 1.5319780252324644e-05, + "loss": 0.0005, + "num_tokens": 1039462882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3896901937355978, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046274500292201563, + "kl": 0.0126190185546875, + "learning_rate": 1.5314734420892725e-05, + "loss": 0.0005, + "num_tokens": 1040027154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38986088589229323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002222893350134155, + "kl": 0.0123443603515625, + "learning_rate": 1.5309686702921938e-05, + "loss": 0.0005, + "num_tokens": 1040588706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39003157804898864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024872684536770916, + "kl": 0.0124664306640625, + "learning_rate": 1.5304637100204045e-05, + "loss": 0.0005, + "num_tokens": 1041156546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39020227020568404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021327870652558181, + "kl": 0.0133209228515625, + "learning_rate": 1.5299585614531464e-05, + "loss": 0.0005, + "num_tokens": 1041732818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39037296236237945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044086448239125136, + "kl": 0.012664794921875, + "learning_rate": 1.5294532247697297e-05, + "loss": 0.0005, + "num_tokens": 1042304994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39054365451907486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004900080096953925, + "kl": 0.0127410888671875, + "learning_rate": 1.52894770014953e-05, + "loss": 0.0005, + "num_tokens": 1042877410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39071434667577026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007365575636154018, + "kl": 0.0128021240234375, + "learning_rate": 1.5284419877719908e-05, + "loss": 0.0005, + "num_tokens": 1043440162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39088503883246567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001717527604175061, + "kl": 0.012420654296875, + "learning_rate": 1.5279360878166218e-05, + "loss": 0.0005, + "num_tokens": 1044009938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.391055730989161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019800487427728536, + "kl": 0.0130767822265625, + "learning_rate": 1.527430000462999e-05, + "loss": 0.0005, + "num_tokens": 1044585746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39122642314585643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009037552409735593, + "kl": 0.0128173828125, + "learning_rate": 1.526923725890765e-05, + "loss": 0.0005, + "num_tokens": 1045150082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39139711530255183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003500469658377072, + "kl": 0.012237548828125, + "learning_rate": 1.526417264279629e-05, + "loss": 0.0005, + "num_tokens": 1045712194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39156780745924724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006706768689038696, + "kl": 0.0123291015625, + "learning_rate": 1.525910615809367e-05, + "loss": 0.0005, + "num_tokens": 1046276450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39173849961594265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007388533635757456, + "kl": 0.0131072998046875, + "learning_rate": 1.5254037806598208e-05, + "loss": 0.0005, + "num_tokens": 1046842626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39190919177263805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003139300205039793, + "kl": 0.0125274658203125, + "learning_rate": 1.5248967590108983e-05, + "loss": 0.0005, + "num_tokens": 1047411746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39207988392933346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002638648620346723, + "kl": 0.0122833251953125, + "learning_rate": 1.524389551042574e-05, + "loss": 0.0005, + "num_tokens": 1047976274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39225057608602887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007325022581238998, + "kl": 0.01300048828125, + "learning_rate": 1.5238821569348882e-05, + "loss": 0.0005, + "num_tokens": 1048543346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3924212682427242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016540774077724948, + "kl": 0.012908935546875, + "learning_rate": 1.523374576867948e-05, + "loss": 0.0005, + "num_tokens": 1049107554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3925919603994196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003831161639205319, + "kl": 0.01239013671875, + "learning_rate": 1.5228668110219257e-05, + "loss": 0.0005, + "num_tokens": 1049670450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39276265255611503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043568287293598864, + "kl": 0.012481689453125, + "learning_rate": 1.52235885957706e-05, + "loss": 0.0005, + "num_tokens": 1050236018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39293334471281044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013750647133545309, + "kl": 0.0129852294921875, + "learning_rate": 1.5218507227136555e-05, + "loss": 0.0005, + "num_tokens": 1050796146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39310403686950585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028276480243243254, + "kl": 0.012939453125, + "learning_rate": 1.5213424006120816e-05, + "loss": 0.0005, + "num_tokens": 1051358402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39327472902620125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006231496347699832, + "kl": 0.0136566162109375, + "learning_rate": 1.5208338934527751e-05, + "loss": 0.0005, + "num_tokens": 1051933810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39344542118289666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023292111303246815, + "kl": 0.0122222900390625, + "learning_rate": 1.5203252014162373e-05, + "loss": 0.0005, + "num_tokens": 1052506562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39361611333959207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005015471960903903, + "kl": 0.012237548828125, + "learning_rate": 1.5198163246830361e-05, + "loss": 0.0005, + "num_tokens": 1053070514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3937868054962875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005802549090974999, + "kl": 0.012786865234375, + "learning_rate": 1.5193072634338033e-05, + "loss": 0.0005, + "num_tokens": 1053629874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3939574976529828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005739662569236421, + "kl": 0.012298583984375, + "learning_rate": 1.5187980178492384e-05, + "loss": 0.0005, + "num_tokens": 1054193010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39412818980967823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036421235257101567, + "kl": 0.0123443603515625, + "learning_rate": 1.5182885881101043e-05, + "loss": 0.0005, + "num_tokens": 1054760562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39429888196637364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011582008997672061, + "kl": 0.01318359375, + "learning_rate": 1.5177789743972311e-05, + "loss": 0.0005, + "num_tokens": 1055323266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39446957412306904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044441924178994585, + "kl": 0.0134124755859375, + "learning_rate": 1.5172691768915124e-05, + "loss": 0.0005, + "num_tokens": 1055890370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39464026627976445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014735915107398577, + "kl": 0.0130615234375, + "learning_rate": 1.5167591957739083e-05, + "loss": 0.0005, + "num_tokens": 1056458034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39481095843645986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005287184926615495, + "kl": 0.0120391845703125, + "learning_rate": 1.5162490312254436e-05, + "loss": 0.0005, + "num_tokens": 1057038562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39498165059315526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007713990639307942, + "kl": 0.0125732421875, + "learning_rate": 1.5157386834272082e-05, + "loss": 0.0005, + "num_tokens": 1057603666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39515234274985067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005014725975887516, + "kl": 0.0124053955078125, + "learning_rate": 1.5152281525603577e-05, + "loss": 0.0005, + "num_tokens": 1058166306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.395323034906546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003618256546729643, + "kl": 0.0128631591796875, + "learning_rate": 1.5147174388061113e-05, + "loss": 0.0005, + "num_tokens": 1058733314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39549372706324143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004114310313640004, + "kl": 0.0129852294921875, + "learning_rate": 1.5142065423457547e-05, + "loss": 0.0005, + "num_tokens": 1059294834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39566441921993684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005752664037369754, + "kl": 0.013336181640625, + "learning_rate": 1.5136954633606369e-05, + "loss": 0.0005, + "num_tokens": 1059860754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39583511137663224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004700000471785599, + "kl": 0.012481689453125, + "learning_rate": 1.5131842020321733e-05, + "loss": 0.0005, + "num_tokens": 1060425826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39600580353332765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022354754476437073, + "kl": 0.01275634765625, + "learning_rate": 1.5126727585418428e-05, + "loss": 0.0005, + "num_tokens": 1061000754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39617649569002306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008183938698674059, + "kl": 0.01214599609375, + "learning_rate": 1.5121611330711894e-05, + "loss": 0.0005, + "num_tokens": 1061562402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39634718784671846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020379236632492645, + "kl": 0.012603759765625, + "learning_rate": 1.5116493258018219e-05, + "loss": 0.0005, + "num_tokens": 1062126978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39651788000341387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014815439116379128, + "kl": 0.0123291015625, + "learning_rate": 1.5111373369154128e-05, + "loss": 0.0005, + "num_tokens": 1062692642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3966885721601092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001322823031482675, + "kl": 0.0124969482421875, + "learning_rate": 1.5106251665937004e-05, + "loss": 0.0005, + "num_tokens": 1063263282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3968592643168046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005503093156710395, + "kl": 0.013092041015625, + "learning_rate": 1.5101128150184866e-05, + "loss": 0.0005, + "num_tokens": 1063829330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39702995647350003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032574396152171715, + "kl": 0.012054443359375, + "learning_rate": 1.509600282371637e-05, + "loss": 0.0005, + "num_tokens": 1064392994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39720064863019544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000502687416027001, + "kl": 0.012359619140625, + "learning_rate": 1.5090875688350828e-05, + "loss": 0.0005, + "num_tokens": 1064958434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39737134078689085, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.256044005959599e-05, + "kl": 0.0119781494140625, + "learning_rate": 1.5085746745908187e-05, + "loss": 0.0005, + "num_tokens": 1065524690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39754203294358625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004302519514074684, + "kl": 0.0124969482421875, + "learning_rate": 1.5080615998209033e-05, + "loss": 0.0005, + "num_tokens": 1066087090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39771272510028166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001489950296998756, + "kl": 0.01361083984375, + "learning_rate": 1.5075483447074608e-05, + "loss": 0.0005, + "num_tokens": 1066647794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39788341725697707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000315304335542307, + "kl": 0.012451171875, + "learning_rate": 1.5070349094326766e-05, + "loss": 0.0005, + "num_tokens": 1067214578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3980541094136724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006646314798288735, + "kl": 0.01251220703125, + "learning_rate": 1.506521294178803e-05, + "loss": 0.0005, + "num_tokens": 1067777890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3982248015703678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033173704875269277, + "kl": 0.012786865234375, + "learning_rate": 1.5060074991281537e-05, + "loss": 0.0005, + "num_tokens": 1068345714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39839549372706323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005713959023133317, + "kl": 0.012939453125, + "learning_rate": 1.5054935244631085e-05, + "loss": 0.0005, + "num_tokens": 1068910066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39856618588375864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005695487021561697, + "kl": 0.0125732421875, + "learning_rate": 1.5049793703661091e-05, + "loss": 0.0005, + "num_tokens": 1069471890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39873687804045405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020381608943144047, + "kl": 0.012481689453125, + "learning_rate": 1.5044650370196622e-05, + "loss": 0.0005, + "num_tokens": 1070038994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39890757019714945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002928980832414065, + "kl": 0.012725830078125, + "learning_rate": 1.5039505246063372e-05, + "loss": 0.0005, + "num_tokens": 1070605138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39907826235384486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042063952732064315, + "kl": 0.012786865234375, + "learning_rate": 1.5034358333087678e-05, + "loss": 0.0005, + "num_tokens": 1071167250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39924895451054027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014741348166969325, + "kl": 0.01287841796875, + "learning_rate": 1.50292096330965e-05, + "loss": 0.0005, + "num_tokens": 1071733778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3994196466672356, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.964630417224286e-05, + "kl": 0.012481689453125, + "learning_rate": 1.5024059147917448e-05, + "loss": 0.0005, + "num_tokens": 1072296114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.399590338823931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012920656859027855, + "kl": 0.012542724609375, + "learning_rate": 1.5018906879378761e-05, + "loss": 0.0005, + "num_tokens": 1072860546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39976103098062643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019017172787064325, + "kl": 0.0123748779296875, + "learning_rate": 1.5013752829309298e-05, + "loss": 0.0005, + "num_tokens": 1073430850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39993172313732184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019575108428161427, + "kl": 0.0128173828125, + "learning_rate": 1.500859699953857e-05, + "loss": 0.0005, + "num_tokens": 1073998514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40010241529401724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040069560920648773, + "kl": 0.0126190185546875, + "learning_rate": 1.5003439391896706e-05, + "loss": 0.0005, + "num_tokens": 1074568354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40027310745071265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017669672457112978, + "kl": 0.01348876953125, + "learning_rate": 1.4998280008214473e-05, + "loss": 0.0005, + "num_tokens": 1075137362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40044379960740806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001520353917237503, + "kl": 0.0125274658203125, + "learning_rate": 1.4993118850323265e-05, + "loss": 0.0005, + "num_tokens": 1075703874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40061449176410346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015959068867748516, + "kl": 0.0131683349609375, + "learning_rate": 1.4987955920055107e-05, + "loss": 0.0005, + "num_tokens": 1076271314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4007851839207988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017821038496433487, + "kl": 0.012237548828125, + "learning_rate": 1.4982791219242647e-05, + "loss": 0.0005, + "num_tokens": 1076830690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4009558760774942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032303201723955213, + "kl": 0.0126495361328125, + "learning_rate": 1.497762474971918e-05, + "loss": 0.0005, + "num_tokens": 1077395362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40112656823418963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022477021883026797, + "kl": 0.0128326416015625, + "learning_rate": 1.4972456513318602e-05, + "loss": 0.0005, + "num_tokens": 1077959218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40129726039088504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019612512917069387, + "kl": 0.01220703125, + "learning_rate": 1.4967286511875462e-05, + "loss": 0.0005, + "num_tokens": 1078527410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40146795254758044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016649384552718376, + "kl": 0.012939453125, + "learning_rate": 1.4962114747224916e-05, + "loss": 0.0005, + "num_tokens": 1079087634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40163864470427585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003235050171474022, + "kl": 0.01214599609375, + "learning_rate": 1.4956941221202757e-05, + "loss": 0.0005, + "num_tokens": 1079652178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40180933686097126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000236898066374418, + "kl": 0.012542724609375, + "learning_rate": 1.4951765935645401e-05, + "loss": 0.0005, + "num_tokens": 1080216258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40198002901766666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012471712141542035, + "kl": 0.0126190185546875, + "learning_rate": 1.4946588892389887e-05, + "loss": 0.0005, + "num_tokens": 1080783490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.402150721174362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000352181038762054, + "kl": 0.012237548828125, + "learning_rate": 1.4941410093273875e-05, + "loss": 0.0005, + "num_tokens": 1081350626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4023214133310574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005291886060715648, + "kl": 0.012786865234375, + "learning_rate": 1.4936229540135659e-05, + "loss": 0.0005, + "num_tokens": 1081913394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4024921054877528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004924829573867657, + "kl": 0.01263427734375, + "learning_rate": 1.493104723481414e-05, + "loss": 0.0005, + "num_tokens": 1082480338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40266279764444823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006334857967865122, + "kl": 0.0123443603515625, + "learning_rate": 1.4925863179148853e-05, + "loss": 0.0005, + "num_tokens": 1083046130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40283348980114364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003897739116539795, + "kl": 0.0122833251953125, + "learning_rate": 1.4920677374979952e-05, + "loss": 0.0005, + "num_tokens": 1083612818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40300418195783905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004997091110362605, + "kl": 0.01214599609375, + "learning_rate": 1.491548982414821e-05, + "loss": 0.0005, + "num_tokens": 1084175442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40317487411453445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002684759140704534, + "kl": 0.0121612548828125, + "learning_rate": 1.491030052849502e-05, + "loss": 0.0005, + "num_tokens": 1084736290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40334556627122986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019284966300493368, + "kl": 0.0133209228515625, + "learning_rate": 1.4905109489862393e-05, + "loss": 0.0005, + "num_tokens": 1085301410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4035162584279252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000342353066268509, + "kl": 0.012298583984375, + "learning_rate": 1.4899916710092961e-05, + "loss": 0.0005, + "num_tokens": 1085867298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4036869505846206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005481464210123317, + "kl": 0.012176513671875, + "learning_rate": 1.4894722191029974e-05, + "loss": 0.0005, + "num_tokens": 1086434578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.403857642741316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012585337131774139, + "kl": 0.01239013671875, + "learning_rate": 1.4889525934517299e-05, + "loss": 0.0005, + "num_tokens": 1087000898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40402833489801143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001912263534150029, + "kl": 0.012664794921875, + "learning_rate": 1.4884327942399419e-05, + "loss": 0.0005, + "num_tokens": 1087563186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40419902705470684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005309538734643934, + "kl": 0.012939453125, + "learning_rate": 1.4879128216521433e-05, + "loss": 0.0005, + "num_tokens": 1088134466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40436971921140225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018979178152734292, + "kl": 0.0127105712890625, + "learning_rate": 1.4873926758729056e-05, + "loss": 0.0005, + "num_tokens": 1088707538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40454041136809765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036208232900203184, + "kl": 0.0127105712890625, + "learning_rate": 1.4868723570868619e-05, + "loss": 0.0005, + "num_tokens": 1089274146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40471110352479306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001347103607004483, + "kl": 0.0127410888671875, + "learning_rate": 1.4863518654787067e-05, + "loss": 0.0005, + "num_tokens": 1089836578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4048817956814884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004253488508301947, + "kl": 0.012542724609375, + "learning_rate": 1.485831201233195e-05, + "loss": 0.0005, + "num_tokens": 1090407346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4050524878381838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020050291759468307, + "kl": 0.0130615234375, + "learning_rate": 1.485310364535145e-05, + "loss": 0.0005, + "num_tokens": 1090977938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4052231799948792, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002440901605830682, + "kl": 0.0124969482421875, + "learning_rate": 1.4847893555694337e-05, + "loss": 0.0005, + "num_tokens": 1091541058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40539387215157463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040545004770977864, + "kl": 0.0125885009765625, + "learning_rate": 1.4842681745210016e-05, + "loss": 0.0005, + "num_tokens": 1092106322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40556456430827004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030123056717833304, + "kl": 0.0122833251953125, + "learning_rate": 1.4837468215748483e-05, + "loss": 0.0005, + "num_tokens": 1092667922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40573525646496544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003355183594048335, + "kl": 0.012542724609375, + "learning_rate": 1.4832252969160359e-05, + "loss": 0.0005, + "num_tokens": 1093237362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40590594862166085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004332848754051674, + "kl": 0.0126953125, + "learning_rate": 1.482703600729686e-05, + "loss": 0.0005, + "num_tokens": 1093801602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40607664077835626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026375061773556885, + "kl": 0.01300048828125, + "learning_rate": 1.482181733200983e-05, + "loss": 0.0005, + "num_tokens": 1094363778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4062473329350516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002146043410330507, + "kl": 0.0131378173828125, + "learning_rate": 1.48165969451517e-05, + "loss": 0.0005, + "num_tokens": 1094924642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.406418025091747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007758754030799189, + "kl": 0.013031005859375, + "learning_rate": 1.4811374848575529e-05, + "loss": 0.0005, + "num_tokens": 1095489810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4065887172484424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023179603517839773, + "kl": 0.0121612548828125, + "learning_rate": 1.4806151044134962e-05, + "loss": 0.0005, + "num_tokens": 1096049090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40675940940513783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031728963686249883, + "kl": 0.013031005859375, + "learning_rate": 1.480092553368427e-05, + "loss": 0.0005, + "num_tokens": 1096610258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40693010156183324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012659198781509143, + "kl": 0.0127410888671875, + "learning_rate": 1.4795698319078315e-05, + "loss": 0.0005, + "num_tokens": 1097177266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40710079371852864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037379461880064754, + "kl": 0.012115478515625, + "learning_rate": 1.4790469402172574e-05, + "loss": 0.0005, + "num_tokens": 1097745842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40727148587522405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004776793137819435, + "kl": 0.0125732421875, + "learning_rate": 1.478523878482312e-05, + "loss": 0.0005, + "num_tokens": 1098305266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40744217803191946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004857469101651428, + "kl": 0.0126800537109375, + "learning_rate": 1.4780006468886632e-05, + "loss": 0.0005, + "num_tokens": 1098867106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4076128701886148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010414031330683983, + "kl": 0.0128326416015625, + "learning_rate": 1.4774772456220398e-05, + "loss": 0.0005, + "num_tokens": 1099432130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4077835623453102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002454284813203226, + "kl": 0.01239013671875, + "learning_rate": 1.4769536748682303e-05, + "loss": 0.0005, + "num_tokens": 1100007026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4079542545020056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002876018162874635, + "kl": 0.0125274658203125, + "learning_rate": 1.476429934813083e-05, + "loss": 0.0005, + "num_tokens": 1100569602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.408124946658701, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010632464354948803, + "kl": 0.012542724609375, + "learning_rate": 1.475906025642507e-05, + "loss": 0.0005, + "num_tokens": 1101138802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40829563881539643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025588581089183086, + "kl": 0.012115478515625, + "learning_rate": 1.4753819475424714e-05, + "loss": 0.0005, + "num_tokens": 1101701010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40846633097209184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004982054917503193, + "kl": 0.0120391845703125, + "learning_rate": 1.4748577006990043e-05, + "loss": 0.0005, + "num_tokens": 1102272786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40863702312878725, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.746708752078894e-05, + "kl": 0.0125885009765625, + "learning_rate": 1.474333285298195e-05, + "loss": 0.0005, + "num_tokens": 1102840418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40880771528548265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010760793160949822, + "kl": 0.01287841796875, + "learning_rate": 1.473808701526192e-05, + "loss": 0.0005, + "num_tokens": 1103406530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.408978407442178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001591666131162618, + "kl": 0.0122222900390625, + "learning_rate": 1.4732839495692035e-05, + "loss": 0.0005, + "num_tokens": 1103970290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4091490995988734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038180811175617856, + "kl": 0.0128326416015625, + "learning_rate": 1.4727590296134975e-05, + "loss": 0.0005, + "num_tokens": 1104529922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4093197917555688, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.32728567185015e-05, + "kl": 0.0123443603515625, + "learning_rate": 1.4722339418454015e-05, + "loss": 0.0005, + "num_tokens": 1105093954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4094904839122642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011927222837235607, + "kl": 0.01214599609375, + "learning_rate": 1.4717086864513026e-05, + "loss": 0.0005, + "num_tokens": 1105661730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40966117606895963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001281155786900896, + "kl": 0.0123138427734375, + "learning_rate": 1.471183263617648e-05, + "loss": 0.0005, + "num_tokens": 1106228322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.40983186822565504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003341191654897432, + "kl": 0.01226806640625, + "learning_rate": 1.4706576735309435e-05, + "loss": 0.0005, + "num_tokens": 1106792914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41000256038235044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042284104133445715, + "kl": 0.0128326416015625, + "learning_rate": 1.4701319163777544e-05, + "loss": 0.0005, + "num_tokens": 1107356706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41017325253904585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020747590330400474, + "kl": 0.01251220703125, + "learning_rate": 1.4696059923447062e-05, + "loss": 0.0005, + "num_tokens": 1107921906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41034394469574126, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.346255628798929e-05, + "kl": 0.012451171875, + "learning_rate": 1.469079901618482e-05, + "loss": 0.0005, + "num_tokens": 1108487202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4105146368524366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018940417300538474, + "kl": 0.012664794921875, + "learning_rate": 1.4685536443858262e-05, + "loss": 0.0005, + "num_tokens": 1109054722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.410685329009132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002985121324166361, + "kl": 0.0124359130859375, + "learning_rate": 1.4680272208335398e-05, + "loss": 0.0005, + "num_tokens": 1109622690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4108560211658274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003635858093116654, + "kl": 0.01251220703125, + "learning_rate": 1.4675006311484854e-05, + "loss": 0.0005, + "num_tokens": 1110186914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41102671332252283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024782385048845607, + "kl": 0.012969970703125, + "learning_rate": 1.4669738755175822e-05, + "loss": 0.0005, + "num_tokens": 1110755538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41119740547921824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017040420774782328, + "kl": 0.0124664306640625, + "learning_rate": 1.4664469541278106e-05, + "loss": 0.0005, + "num_tokens": 1111317378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41136809763591364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001852883424791572, + "kl": 0.0125579833984375, + "learning_rate": 1.4659198671662073e-05, + "loss": 0.0005, + "num_tokens": 1111888050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41153878979260905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013686666570781954, + "kl": 0.012298583984375, + "learning_rate": 1.4653926148198705e-05, + "loss": 0.0005, + "num_tokens": 1112453058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41170948194930446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003682753559693015, + "kl": 0.013092041015625, + "learning_rate": 1.4648651972759549e-05, + "loss": 0.0005, + "num_tokens": 1113018914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4118801741059998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002306255330657189, + "kl": 0.0125579833984375, + "learning_rate": 1.4643376147216751e-05, + "loss": 0.0005, + "num_tokens": 1113581682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4120508662626952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000500873311809878, + "kl": 0.012664794921875, + "learning_rate": 1.4638098673443036e-05, + "loss": 0.0005, + "num_tokens": 1114151890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4122215584193906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000720581048006358, + "kl": 0.0129241943359375, + "learning_rate": 1.4632819553311715e-05, + "loss": 0.0005, + "num_tokens": 1114724498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41239225057608603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014611618138718593, + "kl": 0.0124664306640625, + "learning_rate": 1.4627538788696691e-05, + "loss": 0.0005, + "num_tokens": 1115286258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41256294273278143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029006187515906185, + "kl": 0.0121917724609375, + "learning_rate": 1.4622256381472442e-05, + "loss": 0.0005, + "num_tokens": 1115851986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41273363488947684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003263328353188619, + "kl": 0.012420654296875, + "learning_rate": 1.4616972333514028e-05, + "loss": 0.0005, + "num_tokens": 1116417106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41290432704617225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007498815921134277, + "kl": 0.0123443603515625, + "learning_rate": 1.4611686646697103e-05, + "loss": 0.0005, + "num_tokens": 1116979426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41307501920286765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018711210059345028, + "kl": 0.011932373046875, + "learning_rate": 1.4606399322897889e-05, + "loss": 0.0005, + "num_tokens": 1117541362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.413245711359563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002481288676354954, + "kl": 0.01318359375, + "learning_rate": 1.4601110363993196e-05, + "loss": 0.0005, + "num_tokens": 1118109762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4134164035162584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012537656347461223, + "kl": 0.0122528076171875, + "learning_rate": 1.4595819771860415e-05, + "loss": 0.0005, + "num_tokens": 1118688258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4135870956729538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008126776671190537, + "kl": 0.012115478515625, + "learning_rate": 1.4590527548377513e-05, + "loss": 0.0005, + "num_tokens": 1119254226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4137577878296492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019106405706398674, + "kl": 0.0131988525390625, + "learning_rate": 1.4585233695423043e-05, + "loss": 0.0005, + "num_tokens": 1119820818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41392847998634463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028209750846264372, + "kl": 0.012481689453125, + "learning_rate": 1.4579938214876126e-05, + "loss": 0.0005, + "num_tokens": 1120389842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41409917214304004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002884811096638634, + "kl": 0.0128936767578125, + "learning_rate": 1.457464110861647e-05, + "loss": 0.0005, + "num_tokens": 1120954434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41426986429973545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042212200722537235, + "kl": 0.012298583984375, + "learning_rate": 1.4569342378524356e-05, + "loss": 0.0005, + "num_tokens": 1121520322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41444055645643085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004740975890937528, + "kl": 0.0123138427734375, + "learning_rate": 1.456404202648064e-05, + "loss": 0.0005, + "num_tokens": 1122083986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4146112486131262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012771818782588606, + "kl": 0.0121612548828125, + "learning_rate": 1.4558740054366758e-05, + "loss": 0.0005, + "num_tokens": 1122650562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4147819407698216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003793705420451096, + "kl": 0.0129241943359375, + "learning_rate": 1.4553436464064719e-05, + "loss": 0.0005, + "num_tokens": 1123210066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.414952632926517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028542009657969844, + "kl": 0.01239013671875, + "learning_rate": 1.4548131257457103e-05, + "loss": 0.0005, + "num_tokens": 1123779970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4151233250832124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015473804852496505, + "kl": 0.01214599609375, + "learning_rate": 1.4542824436427068e-05, + "loss": 0.0005, + "num_tokens": 1124342802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41529401723990783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035109353127373526, + "kl": 0.01300048828125, + "learning_rate": 1.4537516002858348e-05, + "loss": 0.0005, + "num_tokens": 1124906242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41546470939660324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005824935507052134, + "kl": 0.0124359130859375, + "learning_rate": 1.4532205958635237e-05, + "loss": 0.0005, + "num_tokens": 1125471042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41563540155329864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046383565668472315, + "kl": 0.012420654296875, + "learning_rate": 1.4526894305642618e-05, + "loss": 0.0005, + "num_tokens": 1126035250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41580609370999405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006983689458355945, + "kl": 0.0126495361328125, + "learning_rate": 1.4521581045765928e-05, + "loss": 0.0005, + "num_tokens": 1126610322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4159767858666894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002499458797912055, + "kl": 0.011993408203125, + "learning_rate": 1.4516266180891191e-05, + "loss": 0.0005, + "num_tokens": 1127183858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4161474780233848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003409127402063838, + "kl": 0.012603759765625, + "learning_rate": 1.4510949712904985e-05, + "loss": 0.0005, + "num_tokens": 1127750418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4163181701800802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044497616645078613, + "kl": 0.0122528076171875, + "learning_rate": 1.4505631643694469e-05, + "loss": 0.0005, + "num_tokens": 1128317026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4164888623367756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003781891354028994, + "kl": 0.0124053955078125, + "learning_rate": 1.450031197514736e-05, + "loss": 0.0005, + "num_tokens": 1128883282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41665955449347103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006382505973805474, + "kl": 0.012298583984375, + "learning_rate": 1.4494990709151957e-05, + "loss": 0.0005, + "num_tokens": 1129448290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41683024665016644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003540343789509278, + "kl": 0.0118865966796875, + "learning_rate": 1.4489667847597106e-05, + "loss": 0.0005, + "num_tokens": 1130014530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41700093880686184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029163116632890897, + "kl": 0.0124969482421875, + "learning_rate": 1.4484343392372242e-05, + "loss": 0.0005, + "num_tokens": 1130579682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41717163096355725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008641327043886507, + "kl": 0.0121917724609375, + "learning_rate": 1.4479017345367345e-05, + "loss": 0.0005, + "num_tokens": 1131143522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4173423231202526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003413575369318945, + "kl": 0.012603759765625, + "learning_rate": 1.4473689708472975e-05, + "loss": 0.0005, + "num_tokens": 1131712802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.417513015276948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001999206136083471, + "kl": 0.01318359375, + "learning_rate": 1.446836048358025e-05, + "loss": 0.0005, + "num_tokens": 1132277506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4176837074336434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003552574120986223, + "kl": 0.012603759765625, + "learning_rate": 1.4463029672580853e-05, + "loss": 0.0005, + "num_tokens": 1132841522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4178543995903388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003383846622511349, + "kl": 0.012603759765625, + "learning_rate": 1.4457697277367026e-05, + "loss": 0.0005, + "num_tokens": 1133404514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4180250917470342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005948987219382194, + "kl": 0.01275634765625, + "learning_rate": 1.4452363299831582e-05, + "loss": 0.0005, + "num_tokens": 1133971090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41819578390372963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007041130259347444, + "kl": 0.0133514404296875, + "learning_rate": 1.4447027741867886e-05, + "loss": 0.0005, + "num_tokens": 1134533762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41836647606042504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011473393972639056, + "kl": 0.0128173828125, + "learning_rate": 1.4441690605369869e-05, + "loss": 0.0005, + "num_tokens": 1135102450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41853716821712045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016132521962204758, + "kl": 0.0125732421875, + "learning_rate": 1.4436351892232025e-05, + "loss": 0.0005, + "num_tokens": 1135666306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4187078603738158, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004785312378196614, + "kl": 0.0123443603515625, + "learning_rate": 1.4431011604349402e-05, + "loss": 0.0005, + "num_tokens": 1136232002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4188785525305112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033060783803939045, + "kl": 0.0123291015625, + "learning_rate": 1.4425669743617609e-05, + "loss": 0.0005, + "num_tokens": 1136797874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4190492446872066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008449152640660675, + "kl": 0.0128631591796875, + "learning_rate": 1.4420326311932815e-05, + "loss": 0.0005, + "num_tokens": 1137369170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.419219936843902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032655617519695767, + "kl": 0.012847900390625, + "learning_rate": 1.4414981311191744e-05, + "loss": 0.0005, + "num_tokens": 1137934290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4193906290005974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000428014427136895, + "kl": 0.0120849609375, + "learning_rate": 1.440963474329168e-05, + "loss": 0.0005, + "num_tokens": 1138501874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41956132115729283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022928135653217362, + "kl": 0.0127410888671875, + "learning_rate": 1.440428661013046e-05, + "loss": 0.0005, + "num_tokens": 1139068034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41973201331398824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031193905403720023, + "kl": 0.0130767822265625, + "learning_rate": 1.439893691360648e-05, + "loss": 0.0005, + "num_tokens": 1139640834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41990270547068365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029819779113526016, + "kl": 0.0128936767578125, + "learning_rate": 1.439358565561869e-05, + "loss": 0.0005, + "num_tokens": 1140203346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.420073397627379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013684436564495413, + "kl": 0.0127716064453125, + "learning_rate": 1.4388232838066587e-05, + "loss": 0.0005, + "num_tokens": 1140766482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4202440897840744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012968261202817927, + "kl": 0.012359619140625, + "learning_rate": 1.4382878462850233e-05, + "loss": 0.0005, + "num_tokens": 1141333570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4204147819407698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003009363593381106, + "kl": 0.0129852294921875, + "learning_rate": 1.4377522531870242e-05, + "loss": 0.0005, + "num_tokens": 1141906914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4205854740974652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001245250083355975, + "kl": 0.0126800537109375, + "learning_rate": 1.437216504702777e-05, + "loss": 0.0005, + "num_tokens": 1142478002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4207561662541606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004916980330399758, + "kl": 0.0126495361328125, + "learning_rate": 1.436680601022453e-05, + "loss": 0.0005, + "num_tokens": 1143043826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42092685841085603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012642184923152, + "kl": 0.0120849609375, + "learning_rate": 1.436144542336279e-05, + "loss": 0.0005, + "num_tokens": 1143608594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42109755056755144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010991901485370144, + "kl": 0.01300048828125, + "learning_rate": 1.4356083288345367e-05, + "loss": 0.0005, + "num_tokens": 1144173490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42126824272424684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002102602738936124, + "kl": 0.0124664306640625, + "learning_rate": 1.4350719607075616e-05, + "loss": 0.0005, + "num_tokens": 1144736434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4214389348809422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013410749336271983, + "kl": 0.0123138427734375, + "learning_rate": 1.4345354381457463e-05, + "loss": 0.0005, + "num_tokens": 1145301138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4216096270376376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037993297237565827, + "kl": 0.0126190185546875, + "learning_rate": 1.433998761339536e-05, + "loss": 0.0005, + "num_tokens": 1145874850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.421780319194333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004114790518097703, + "kl": 0.0127716064453125, + "learning_rate": 1.4334619304794317e-05, + "loss": 0.0005, + "num_tokens": 1146447602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4219510113510284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008492118983203127, + "kl": 0.0124969482421875, + "learning_rate": 1.4329249457559892e-05, + "loss": 0.0005, + "num_tokens": 1147022978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4221217035077238, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.618804512958339e-05, + "kl": 0.012420654296875, + "learning_rate": 1.432387807359819e-05, + "loss": 0.0005, + "num_tokens": 1147585570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42229239566441923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001478582875575485, + "kl": 0.0123748779296875, + "learning_rate": 1.4318505154815851e-05, + "loss": 0.0005, + "num_tokens": 1148154162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42246308782111464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01139508191680118, + "kl": 0.0137786865234375, + "learning_rate": 1.4313130703120073e-05, + "loss": 0.0006, + "num_tokens": 1148720226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42263377997781004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024413973780878666, + "kl": 0.012908935546875, + "learning_rate": 1.4307754720418592e-05, + "loss": 0.0005, + "num_tokens": 1149286946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4228044721345054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006186293169143195, + "kl": 0.0124053955078125, + "learning_rate": 1.4302377208619684e-05, + "loss": 0.0005, + "num_tokens": 1149850850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4229751642912008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020864976092246698, + "kl": 0.0130767822265625, + "learning_rate": 1.4296998169632175e-05, + "loss": 0.0005, + "num_tokens": 1150417282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4231458564478962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006394451272341508, + "kl": 0.013092041015625, + "learning_rate": 1.4291617605365427e-05, + "loss": 0.0005, + "num_tokens": 1150985634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4233165486045916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007645880762614839, + "kl": 0.0128326416015625, + "learning_rate": 1.4286235517729348e-05, + "loss": 0.0005, + "num_tokens": 1151551298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.423487240761287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034975906969544147, + "kl": 0.012420654296875, + "learning_rate": 1.4280851908634386e-05, + "loss": 0.0005, + "num_tokens": 1152113602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4236579329179824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008677974803694556, + "kl": 0.0128326416015625, + "learning_rate": 1.4275466779991526e-05, + "loss": 0.0005, + "num_tokens": 1152681938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42382862507467783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035040574780047, + "kl": 0.0126190185546875, + "learning_rate": 1.4270080133712292e-05, + "loss": 0.0005, + "num_tokens": 1153252050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42399931723137324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006275500930289793, + "kl": 0.0125885009765625, + "learning_rate": 1.4264691971708753e-05, + "loss": 0.0005, + "num_tokens": 1153823410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4241700093880686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00059736319671158, + "kl": 0.012451171875, + "learning_rate": 1.4259302295893511e-05, + "loss": 0.0005, + "num_tokens": 1154389058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.424340701544764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003611322475736129, + "kl": 0.0126800537109375, + "learning_rate": 1.4253911108179708e-05, + "loss": 0.0005, + "num_tokens": 1154954386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4245113937014594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046757940508893557, + "kl": 0.0125579833984375, + "learning_rate": 1.4248518410481016e-05, + "loss": 0.0005, + "num_tokens": 1155522082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4246820858581548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004579732244884339, + "kl": 0.0128326416015625, + "learning_rate": 1.4243124204711651e-05, + "loss": 0.0005, + "num_tokens": 1156087682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4248527780148502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017558227284717972, + "kl": 0.0128631591796875, + "learning_rate": 1.4237728492786365e-05, + "loss": 0.0005, + "num_tokens": 1156650114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4250234701715456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010209721924696883, + "kl": 0.0128021240234375, + "learning_rate": 1.4232331276620433e-05, + "loss": 0.0005, + "num_tokens": 1157218706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42519416232824103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005100460026973271, + "kl": 0.0133209228515625, + "learning_rate": 1.4226932558129677e-05, + "loss": 0.0005, + "num_tokens": 1157786674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42536485448493644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004721390355308905, + "kl": 0.0126953125, + "learning_rate": 1.4221532339230448e-05, + "loss": 0.0005, + "num_tokens": 1158360914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4255355466416318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040422373640896137, + "kl": 0.012908935546875, + "learning_rate": 1.4216130621839626e-05, + "loss": 0.0005, + "num_tokens": 1158918338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4257062387983272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000666767896966176, + "kl": 0.0127410888671875, + "learning_rate": 1.4210727407874624e-05, + "loss": 0.0005, + "num_tokens": 1159506898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4258769309550226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006891977658310012, + "kl": 0.0124969482421875, + "learning_rate": 1.4205322699253397e-05, + "loss": 0.0005, + "num_tokens": 1160071650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.426047623111718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009563949861839909, + "kl": 0.0126953125, + "learning_rate": 1.4199916497894411e-05, + "loss": 0.0005, + "num_tokens": 1160643474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4262183152684134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007330088321284342, + "kl": 0.012664794921875, + "learning_rate": 1.4194508805716679e-05, + "loss": 0.0005, + "num_tokens": 1161205362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4263890074251088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018653333607505075, + "kl": 0.012298583984375, + "learning_rate": 1.4189099624639729e-05, + "loss": 0.0005, + "num_tokens": 1161769298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42655969958180423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002664289369219206, + "kl": 0.0125732421875, + "learning_rate": 1.4183688956583637e-05, + "loss": 0.0005, + "num_tokens": 1162333106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42673039173849964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007376480385179323, + "kl": 0.012298583984375, + "learning_rate": 1.4178276803468983e-05, + "loss": 0.0005, + "num_tokens": 1162895890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.426901083895195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021389895769742117, + "kl": 0.0130462646484375, + "learning_rate": 1.4172863167216892e-05, + "loss": 0.0005, + "num_tokens": 1163460466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4270717760518904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000767949410774145, + "kl": 0.012359619140625, + "learning_rate": 1.4167448049749009e-05, + "loss": 0.0005, + "num_tokens": 1164021762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4272424682085858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006690085580373133, + "kl": 0.012603759765625, + "learning_rate": 1.4162031452987505e-05, + "loss": 0.0005, + "num_tokens": 1164582226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4274131603652812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003107811659853578, + "kl": 0.0128326416015625, + "learning_rate": 1.4156613378855072e-05, + "loss": 0.0005, + "num_tokens": 1165147602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4275838525219766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000350131778730654, + "kl": 0.0123291015625, + "learning_rate": 1.415119382927494e-05, + "loss": 0.0005, + "num_tokens": 1165710546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.427754544678672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004171066783787695, + "kl": 0.0126190185546875, + "learning_rate": 1.4145772806170846e-05, + "loss": 0.0005, + "num_tokens": 1166284114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42792523683536743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005043380937264213, + "kl": 0.012542724609375, + "learning_rate": 1.414035031146706e-05, + "loss": 0.0005, + "num_tokens": 1166852274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42809592899206284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012935990054722748, + "kl": 0.0120697021484375, + "learning_rate": 1.413492634708837e-05, + "loss": 0.0005, + "num_tokens": 1167417346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42826662114875824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005443039767423087, + "kl": 0.0125274658203125, + "learning_rate": 1.412950091496009e-05, + "loss": 0.0005, + "num_tokens": 1167985714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4284373133054536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000538742855063462, + "kl": 0.0125885009765625, + "learning_rate": 1.4124074017008052e-05, + "loss": 0.0005, + "num_tokens": 1168549762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.428608005462149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027480514804173505, + "kl": 0.012481689453125, + "learning_rate": 1.411864565515861e-05, + "loss": 0.0005, + "num_tokens": 1169114290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4287786976188444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008451708427480262, + "kl": 0.0129852294921875, + "learning_rate": 1.4113215831338631e-05, + "loss": 0.0005, + "num_tokens": 1169678162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4289493897755398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006111423443110814, + "kl": 0.0125274658203125, + "learning_rate": 1.4107784547475514e-05, + "loss": 0.0005, + "num_tokens": 1170245410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4291200819322352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005066422520886113, + "kl": 0.012664794921875, + "learning_rate": 1.4102351805497164e-05, + "loss": 0.0005, + "num_tokens": 1170806674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4292907740889306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005119102289164612, + "kl": 0.012359619140625, + "learning_rate": 1.4096917607332009e-05, + "loss": 0.0005, + "num_tokens": 1171373186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42946146624562603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002857524878314206, + "kl": 0.0129852294921875, + "learning_rate": 1.4091481954908992e-05, + "loss": 0.0005, + "num_tokens": 1171937074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.42963215840232144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002607966708596671, + "kl": 0.0124664306640625, + "learning_rate": 1.4086044850157578e-05, + "loss": 0.0005, + "num_tokens": 1172504082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4298028505590168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005197475293012727, + "kl": 0.012786865234375, + "learning_rate": 1.408060629500774e-05, + "loss": 0.0005, + "num_tokens": 1173067346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4299735427157122, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019546069643103668, + "kl": 0.0123291015625, + "learning_rate": 1.4075166291389968e-05, + "loss": 0.0005, + "num_tokens": 1173635682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4301442348724076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005334159060668195, + "kl": 0.0129241943359375, + "learning_rate": 1.4069724841235267e-05, + "loss": 0.0005, + "num_tokens": 1174202978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.430314927029103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007279244760931765, + "kl": 0.0128631591796875, + "learning_rate": 1.4064281946475156e-05, + "loss": 0.0005, + "num_tokens": 1174766402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4304856191857984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036496867134965883, + "kl": 0.012176513671875, + "learning_rate": 1.4058837609041664e-05, + "loss": 0.0005, + "num_tokens": 1175331634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4306563113424938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027195051852809756, + "kl": 0.0126190185546875, + "learning_rate": 1.405339183086734e-05, + "loss": 0.0005, + "num_tokens": 1175901602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43082700349918923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031154832576825767, + "kl": 0.0123291015625, + "learning_rate": 1.4047944613885231e-05, + "loss": 0.0005, + "num_tokens": 1176463570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43099769565588464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000402971395810442, + "kl": 0.012451171875, + "learning_rate": 1.4042495960028912e-05, + "loss": 0.0005, + "num_tokens": 1177025410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43116838781258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000321243616366615, + "kl": 0.01348876953125, + "learning_rate": 1.403704587123245e-05, + "loss": 0.0005, + "num_tokens": 1177590722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4313390799692754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044565610263478045, + "kl": 0.0123748779296875, + "learning_rate": 1.4031594349430434e-05, + "loss": 0.0005, + "num_tokens": 1178161874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4315097721259708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032606550796747077, + "kl": 0.012725830078125, + "learning_rate": 1.4026141396557953e-05, + "loss": 0.0005, + "num_tokens": 1178731554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4316804642826662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008969935763230422, + "kl": 0.0131988525390625, + "learning_rate": 1.4020687014550615e-05, + "loss": 0.0005, + "num_tokens": 1179292194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4318511564393616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014601645776380933, + "kl": 0.0132293701171875, + "learning_rate": 1.4015231205344521e-05, + "loss": 0.0005, + "num_tokens": 1179865778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.432021848596057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005811985817196685, + "kl": 0.0118408203125, + "learning_rate": 1.4009773970876297e-05, + "loss": 0.0005, + "num_tokens": 1180433730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43219254075275243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005713469556938985, + "kl": 0.0122833251953125, + "learning_rate": 1.4004315313083052e-05, + "loss": 0.0005, + "num_tokens": 1181001650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43236323290944784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006897873461412635, + "kl": 0.012298583984375, + "learning_rate": 1.3998855233902422e-05, + "loss": 0.0005, + "num_tokens": 1181569362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4325339250661432, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005133387449303322, + "kl": 0.0116119384765625, + "learning_rate": 1.3993393735272532e-05, + "loss": 0.0005, + "num_tokens": 1182141762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4327046172228386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010303199020685908, + "kl": 0.0126800537109375, + "learning_rate": 1.398793081913202e-05, + "loss": 0.0005, + "num_tokens": 1182711106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.432875309379534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012795186930333495, + "kl": 0.0119171142578125, + "learning_rate": 1.3982466487420023e-05, + "loss": 0.0005, + "num_tokens": 1183274002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4330460015362294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001653304739268719, + "kl": 0.0128326416015625, + "learning_rate": 1.3977000742076184e-05, + "loss": 0.0005, + "num_tokens": 1183836418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4332166936929248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003260027484243664, + "kl": 0.0126190185546875, + "learning_rate": 1.3971533585040639e-05, + "loss": 0.0005, + "num_tokens": 1184400914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4333873858496202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003441266849213565, + "kl": 0.01251220703125, + "learning_rate": 1.3966065018254039e-05, + "loss": 0.0005, + "num_tokens": 1184963378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43355807800631563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002757884853599108, + "kl": 0.0131988525390625, + "learning_rate": 1.3960595043657523e-05, + "loss": 0.0005, + "num_tokens": 1185526562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43372877016301103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002609852490491293, + "kl": 0.0132904052734375, + "learning_rate": 1.3955123663192736e-05, + "loss": 0.0005, + "num_tokens": 1186093586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4338994623197064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018836104860996597, + "kl": 0.012359619140625, + "learning_rate": 1.3949650878801819e-05, + "loss": 0.0005, + "num_tokens": 1186658034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4340701544764018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030139553641376723, + "kl": 0.0120391845703125, + "learning_rate": 1.3944176692427415e-05, + "loss": 0.0005, + "num_tokens": 1187225794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4342408466330972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003252177647147265, + "kl": 0.0133514404296875, + "learning_rate": 1.3938701106012664e-05, + "loss": 0.0005, + "num_tokens": 1187791058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4344115387897926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042518512258625247, + "kl": 0.0123138427734375, + "learning_rate": 1.3933224121501196e-05, + "loss": 0.0005, + "num_tokens": 1188353378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.434582230946488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003135914670347206, + "kl": 0.01251220703125, + "learning_rate": 1.3927745740837149e-05, + "loss": 0.0005, + "num_tokens": 1188917170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4347529231031834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023247288419590655, + "kl": 0.01226806640625, + "learning_rate": 1.3922265965965145e-05, + "loss": 0.0005, + "num_tokens": 1189480098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4349236152598788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036672229168969653, + "kl": 0.0131988525390625, + "learning_rate": 1.391678479883031e-05, + "loss": 0.0005, + "num_tokens": 1190043186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43509430741657423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010713133834867725, + "kl": 0.0121612548828125, + "learning_rate": 1.391130224137826e-05, + "loss": 0.0005, + "num_tokens": 1190605842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4352649995732696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005202592106919752, + "kl": 0.0127105712890625, + "learning_rate": 1.3905818295555102e-05, + "loss": 0.0005, + "num_tokens": 1191165330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.435435691729965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005391097958268443, + "kl": 0.0125732421875, + "learning_rate": 1.3900332963307444e-05, + "loss": 0.0005, + "num_tokens": 1191731554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4356063838866604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014553979553824767, + "kl": 0.0127716064453125, + "learning_rate": 1.3894846246582376e-05, + "loss": 0.0005, + "num_tokens": 1192295570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4357770760433558, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.417823656748998e-05, + "kl": 0.01226806640625, + "learning_rate": 1.3889358147327484e-05, + "loss": 0.0005, + "num_tokens": 1192859730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4359477682000512, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037621029764052435, + "kl": 0.012664794921875, + "learning_rate": 1.388386866749085e-05, + "loss": 0.0005, + "num_tokens": 1193419186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4361184603567466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005276923802050657, + "kl": 0.0126953125, + "learning_rate": 1.3878377809021042e-05, + "loss": 0.0005, + "num_tokens": 1193988498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.436289152513442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010487628987186103, + "kl": 0.012359619140625, + "learning_rate": 1.3872885573867105e-05, + "loss": 0.0005, + "num_tokens": 1194551154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43645984467013743, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.84338238706388e-05, + "kl": 0.0126953125, + "learning_rate": 1.3867391963978601e-05, + "loss": 0.0005, + "num_tokens": 1195115474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4366305368268328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020380540874201302, + "kl": 0.0127410888671875, + "learning_rate": 1.386189698130555e-05, + "loss": 0.0005, + "num_tokens": 1195684914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4368012289835282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005727176487803554, + "kl": 0.0115509033203125, + "learning_rate": 1.385640062779848e-05, + "loss": 0.0005, + "num_tokens": 1196247074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4369719211402236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018970592303758072, + "kl": 0.0120849609375, + "learning_rate": 1.385090290540839e-05, + "loss": 0.0005, + "num_tokens": 1196809842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.437142613296919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002313118062387241, + "kl": 0.01251220703125, + "learning_rate": 1.3845403816086784e-05, + "loss": 0.0005, + "num_tokens": 1197378418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4373133054536144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003700811705994728, + "kl": 0.01202392578125, + "learning_rate": 1.383990336178563e-05, + "loss": 0.0005, + "num_tokens": 1197951858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4374839976103098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040822104524326, + "kl": 0.0128936767578125, + "learning_rate": 1.3834401544457398e-05, + "loss": 0.0005, + "num_tokens": 1198515090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4376546897670052, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000246145425480436, + "kl": 0.012451171875, + "learning_rate": 1.3828898366055028e-05, + "loss": 0.0005, + "num_tokens": 1199080082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43782538192370063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003300034921398095, + "kl": 0.0121612548828125, + "learning_rate": 1.3823393828531958e-05, + "loss": 0.0005, + "num_tokens": 1199644066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.437996074080396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001585937356350929, + "kl": 0.01214599609375, + "learning_rate": 1.3817887933842093e-05, + "loss": 0.0005, + "num_tokens": 1200209058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4381667662370914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022365006543886216, + "kl": 0.012908935546875, + "learning_rate": 1.3812380683939828e-05, + "loss": 0.0005, + "num_tokens": 1200790114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4383374583937868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015286461030762627, + "kl": 0.01239013671875, + "learning_rate": 1.3806872080780043e-05, + "loss": 0.0005, + "num_tokens": 1201353906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4385081505504822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004128385605618239, + "kl": 0.01239013671875, + "learning_rate": 1.3801362126318085e-05, + "loss": 0.0005, + "num_tokens": 1201915842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4386788427071776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003447528403663168, + "kl": 0.012359619140625, + "learning_rate": 1.3795850822509798e-05, + "loss": 0.0005, + "num_tokens": 1202479794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.438849534863873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006621833743310019, + "kl": 0.01287841796875, + "learning_rate": 1.3790338171311488e-05, + "loss": 0.0005, + "num_tokens": 1203049250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4390202270205684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002651011553788568, + "kl": 0.0128631591796875, + "learning_rate": 1.3784824174679954e-05, + "loss": 0.0005, + "num_tokens": 1203613218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4391909191772638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009838473090753913, + "kl": 0.0124053955078125, + "learning_rate": 1.3779308834572458e-05, + "loss": 0.0005, + "num_tokens": 1204173986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4393616113339592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020292745915966555, + "kl": 0.012908935546875, + "learning_rate": 1.3773792152946754e-05, + "loss": 0.0005, + "num_tokens": 1204743426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4395323034906546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003381153081804165, + "kl": 0.0123443603515625, + "learning_rate": 1.3768274131761061e-05, + "loss": 0.0005, + "num_tokens": 1205314610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43970299564735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004910626946386222, + "kl": 0.013092041015625, + "learning_rate": 1.3762754772974076e-05, + "loss": 0.0005, + "num_tokens": 1205876898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4398736878040454, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.530056408350549e-05, + "kl": 0.0126495361328125, + "learning_rate": 1.3757234078544976e-05, + "loss": 0.0005, + "num_tokens": 1206439570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4400443799607408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021599544539673728, + "kl": 0.0132598876953125, + "learning_rate": 1.3751712050433408e-05, + "loss": 0.0005, + "num_tokens": 1207006946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4402150721174362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004368446194605506, + "kl": 0.0125274658203125, + "learning_rate": 1.3746188690599488e-05, + "loss": 0.0005, + "num_tokens": 1207572258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4403857642741316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002592505286985359, + "kl": 0.012603759765625, + "learning_rate": 1.3740664001003816e-05, + "loss": 0.0005, + "num_tokens": 1208136930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.440556456430827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000412251067496958, + "kl": 0.0122528076171875, + "learning_rate": 1.3735137983607452e-05, + "loss": 0.0005, + "num_tokens": 1208705490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4407271485875224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000495966391276299, + "kl": 0.0121917724609375, + "learning_rate": 1.3729610640371935e-05, + "loss": 0.0005, + "num_tokens": 1209278818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4408978407442178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001210132119375693, + "kl": 0.0127105712890625, + "learning_rate": 1.3724081973259274e-05, + "loss": 0.0005, + "num_tokens": 1209844354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4410685329009132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006240145330256852, + "kl": 0.012542724609375, + "learning_rate": 1.3718551984231938e-05, + "loss": 0.0005, + "num_tokens": 1210410306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4412392250576086, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004431380371425208, + "kl": 0.0122833251953125, + "learning_rate": 1.3713020675252885e-05, + "loss": 0.0005, + "num_tokens": 1210977634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.441409917214304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000891449935585749, + "kl": 0.01214599609375, + "learning_rate": 1.3707488048285522e-05, + "loss": 0.0005, + "num_tokens": 1211541266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4415806093709994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012361637694935366, + "kl": 0.0132904052734375, + "learning_rate": 1.3701954105293735e-05, + "loss": 0.0005, + "num_tokens": 1212107762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4417513015276948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021280615108811097, + "kl": 0.013031005859375, + "learning_rate": 1.3696418848241874e-05, + "loss": 0.0005, + "num_tokens": 1212670210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4419219936843902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005851258843946767, + "kl": 0.0129241943359375, + "learning_rate": 1.3690882279094758e-05, + "loss": 0.0005, + "num_tokens": 1213234914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4420926858410856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028391507953394224, + "kl": 0.013153076171875, + "learning_rate": 1.3685344399817659e-05, + "loss": 0.0005, + "num_tokens": 1213800834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.442263377997781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016024692755470262, + "kl": 0.01220703125, + "learning_rate": 1.3679805212376338e-05, + "loss": 0.0005, + "num_tokens": 1214364738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4424340701544764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039122164666113336, + "kl": 0.01251220703125, + "learning_rate": 1.3674264718736994e-05, + "loss": 0.0005, + "num_tokens": 1214942866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4426047623111718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028336572944663053, + "kl": 0.0124969482421875, + "learning_rate": 1.3668722920866316e-05, + "loss": 0.0005, + "num_tokens": 1215507922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4427754544678672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023886853030089585, + "kl": 0.0122528076171875, + "learning_rate": 1.3663179820731426e-05, + "loss": 0.0005, + "num_tokens": 1216072802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4429461466245626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005488894160031731, + "kl": 0.012054443359375, + "learning_rate": 1.3657635420299939e-05, + "loss": 0.0005, + "num_tokens": 1216634002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.443116838781258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000786740641914573, + "kl": 0.011810302734375, + "learning_rate": 1.3652089721539908e-05, + "loss": 0.0005, + "num_tokens": 1217204530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4432875309379534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025394480642503356, + "kl": 0.0125885009765625, + "learning_rate": 1.3646542726419857e-05, + "loss": 0.0005, + "num_tokens": 1217767202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4434582230946488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016038408024180584, + "kl": 0.013092041015625, + "learning_rate": 1.3640994436908775e-05, + "loss": 0.0005, + "num_tokens": 1218331122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4436289152513442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000523659277578582, + "kl": 0.0128021240234375, + "learning_rate": 1.3635444854976097e-05, + "loss": 0.0005, + "num_tokens": 1218900690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4437996074080396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006128194336145979, + "kl": 0.0128021240234375, + "learning_rate": 1.3629893982591728e-05, + "loss": 0.0005, + "num_tokens": 1219464178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.443970299564735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012998217594167977, + "kl": 0.0123748779296875, + "learning_rate": 1.3624341821726028e-05, + "loss": 0.0005, + "num_tokens": 1220039714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4441409917214304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016130180490874917, + "kl": 0.0125885009765625, + "learning_rate": 1.3618788374349814e-05, + "loss": 0.0005, + "num_tokens": 1220604706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4443116838781258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032171963392222855, + "kl": 0.0131378173828125, + "learning_rate": 1.3613233642434355e-05, + "loss": 0.0005, + "num_tokens": 1221170130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4444823760348212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029022920437762553, + "kl": 0.0123443603515625, + "learning_rate": 1.3607677627951383e-05, + "loss": 0.0005, + "num_tokens": 1221740610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4446530681915166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007795276850872797, + "kl": 0.0123748779296875, + "learning_rate": 1.3602120332873085e-05, + "loss": 0.0005, + "num_tokens": 1222304738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.444823760348212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006725142972325335, + "kl": 0.0123443603515625, + "learning_rate": 1.3596561759172096e-05, + "loss": 0.0005, + "num_tokens": 1222870562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4449944525049074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017513537917731445, + "kl": 0.0125579833984375, + "learning_rate": 1.3591001908821512e-05, + "loss": 0.0005, + "num_tokens": 1223436594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4451651446616028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046939946903060305, + "kl": 0.0125885009765625, + "learning_rate": 1.3585440783794878e-05, + "loss": 0.0005, + "num_tokens": 1224000642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4453358368182982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042661750449220085, + "kl": 0.0126800537109375, + "learning_rate": 1.3579878386066194e-05, + "loss": 0.0005, + "num_tokens": 1224562674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4455065289749936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005477930482736724, + "kl": 0.0128631591796875, + "learning_rate": 1.3574314717609906e-05, + "loss": 0.0005, + "num_tokens": 1225128002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.445677221131689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007031754142527094, + "kl": 0.012603759765625, + "learning_rate": 1.3568749780400921e-05, + "loss": 0.0005, + "num_tokens": 1225693314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4458479132883844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002448938136877999, + "kl": 0.0128326416015625, + "learning_rate": 1.3563183576414587e-05, + "loss": 0.0005, + "num_tokens": 1226257250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4460186054450798, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003766858865533638, + "kl": 0.0123138427734375, + "learning_rate": 1.3557616107626707e-05, + "loss": 0.0005, + "num_tokens": 1226827714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4461892976017752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005357430884660282, + "kl": 0.01190185546875, + "learning_rate": 1.3552047376013533e-05, + "loss": 0.0005, + "num_tokens": 1227392162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4463599897584706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016364385967589446, + "kl": 0.0123443603515625, + "learning_rate": 1.3546477383551762e-05, + "loss": 0.0005, + "num_tokens": 1227955346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.446530681915166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000584707493587891, + "kl": 0.0118408203125, + "learning_rate": 1.3540906132218537e-05, + "loss": 0.0005, + "num_tokens": 1228521986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4467013740718614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015006805583244986, + "kl": 0.012115478515625, + "learning_rate": 1.353533362399146e-05, + "loss": 0.0005, + "num_tokens": 1229086498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4468720662285568, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000631195842893714, + "kl": 0.01287841796875, + "learning_rate": 1.352975986084856e-05, + "loss": 0.0005, + "num_tokens": 1229650930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4470427583852522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004999200849894918, + "kl": 0.01251220703125, + "learning_rate": 1.3524184844768328e-05, + "loss": 0.0005, + "num_tokens": 1230224946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4472134505419476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001750469965463965, + "kl": 0.012847900390625, + "learning_rate": 1.351860857772969e-05, + "loss": 0.0005, + "num_tokens": 1230794482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.447384142698643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000270363997748972, + "kl": 0.0119171142578125, + "learning_rate": 1.3513031061712026e-05, + "loss": 0.0005, + "num_tokens": 1231365250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4475548348553384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006845137321206288, + "kl": 0.0126190185546875, + "learning_rate": 1.3507452298695143e-05, + "loss": 0.0005, + "num_tokens": 1231934706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4477255270120338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00140489376911152, + "kl": 0.012054443359375, + "learning_rate": 1.350187229065931e-05, + "loss": 0.0005, + "num_tokens": 1232499746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4478962191687292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006753421555614099, + "kl": 0.013519287109375, + "learning_rate": 1.3496291039585221e-05, + "loss": 0.0005, + "num_tokens": 1233072018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4480669113254246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003001356869657871, + "kl": 0.0125579833984375, + "learning_rate": 1.3490708547454024e-05, + "loss": 0.0005, + "num_tokens": 1233630546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.44823760348212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005194255966966473, + "kl": 0.012359619140625, + "learning_rate": 1.3485124816247297e-05, + "loss": 0.0005, + "num_tokens": 1234198194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4484082956388154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029095929388560745, + "kl": 0.0131072998046875, + "learning_rate": 1.3479539847947067e-05, + "loss": 0.0005, + "num_tokens": 1234765970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4485789877955108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000401977470128087, + "kl": 0.0128021240234375, + "learning_rate": 1.3473953644535798e-05, + "loss": 0.0005, + "num_tokens": 1235332242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4487496799522062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000323659350684284, + "kl": 0.0122222900390625, + "learning_rate": 1.3468366207996386e-05, + "loss": 0.0005, + "num_tokens": 1235905890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4489203721089016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003398578501491677, + "kl": 0.012786865234375, + "learning_rate": 1.3462777540312171e-05, + "loss": 0.0005, + "num_tokens": 1236475922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.449091064265597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011792761819822777, + "kl": 0.012939453125, + "learning_rate": 1.3457187643466928e-05, + "loss": 0.0005, + "num_tokens": 1237047906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4492617564222924, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.510062194827525e-05, + "kl": 0.011871337890625, + "learning_rate": 1.345159651944487e-05, + "loss": 0.0005, + "num_tokens": 1237612994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4494324485789878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025012430321500465, + "kl": 0.0126495361328125, + "learning_rate": 1.3446004170230641e-05, + "loss": 0.0005, + "num_tokens": 1238176066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4496031407356832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003084028527060576, + "kl": 0.012725830078125, + "learning_rate": 1.3440410597809325e-05, + "loss": 0.0005, + "num_tokens": 1238739586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4497738328923786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003856079604805124, + "kl": 0.012939453125, + "learning_rate": 1.3434815804166441e-05, + "loss": 0.0005, + "num_tokens": 1239301522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.449944525049074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002186130657954947, + "kl": 0.0124053955078125, + "learning_rate": 1.3429219791287936e-05, + "loss": 0.0005, + "num_tokens": 1239866706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4501152172057694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039501175152631575, + "kl": 0.0122222900390625, + "learning_rate": 1.3423622561160193e-05, + "loss": 0.0005, + "num_tokens": 1240433746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4502859093624648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002173078904948336, + "kl": 0.0125579833984375, + "learning_rate": 1.3418024115770024e-05, + "loss": 0.0005, + "num_tokens": 1241000498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45045660151916017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019132390701861955, + "kl": 0.0128936767578125, + "learning_rate": 1.341242445710468e-05, + "loss": 0.0005, + "num_tokens": 1241561554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4506272936758556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002295049013616917, + "kl": 0.0120697021484375, + "learning_rate": 1.3406823587151834e-05, + "loss": 0.0005, + "num_tokens": 1242122834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.450797985832551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020372892729602745, + "kl": 0.012481689453125, + "learning_rate": 1.3401221507899594e-05, + "loss": 0.0005, + "num_tokens": 1242689810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4509686779892464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003920002341289205, + "kl": 0.012298583984375, + "learning_rate": 1.3395618221336496e-05, + "loss": 0.0005, + "num_tokens": 1243258514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4511393701459418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005718406593471716, + "kl": 0.0125885009765625, + "learning_rate": 1.3390013729451502e-05, + "loss": 0.0005, + "num_tokens": 1243823522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4513100623026372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010209298049607037, + "kl": 0.0128173828125, + "learning_rate": 1.338440803423401e-05, + "loss": 0.0005, + "num_tokens": 1244385234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4514807544593326, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.590343202001737e-05, + "kl": 0.0128021240234375, + "learning_rate": 1.3378801137673834e-05, + "loss": 0.0005, + "num_tokens": 1244950546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.451651446616028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003979485334832125, + "kl": 0.01202392578125, + "learning_rate": 1.3373193041761218e-05, + "loss": 0.0005, + "num_tokens": 1245525538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45182213877272337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005166674118849939, + "kl": 0.0127410888671875, + "learning_rate": 1.3367583748486846e-05, + "loss": 0.0005, + "num_tokens": 1246088242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4519928309294188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034923357618916486, + "kl": 0.0128936767578125, + "learning_rate": 1.3361973259841798e-05, + "loss": 0.0005, + "num_tokens": 1246650722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4521635230861142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017179030380502444, + "kl": 0.0126800537109375, + "learning_rate": 1.3356361577817609e-05, + "loss": 0.0005, + "num_tokens": 1247217234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4523342152428096, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.851382781712642e-05, + "kl": 0.0126190185546875, + "learning_rate": 1.3350748704406213e-05, + "loss": 0.0005, + "num_tokens": 1247785186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.452504907399505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000673847577761576, + "kl": 0.0129547119140625, + "learning_rate": 1.3345134641599987e-05, + "loss": 0.0005, + "num_tokens": 1248350146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4526755995562004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000469192652020704, + "kl": 0.0130462646484375, + "learning_rate": 1.3339519391391711e-05, + "loss": 0.0005, + "num_tokens": 1248916114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4528462917128958, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006629248853252338, + "kl": 0.0127716064453125, + "learning_rate": 1.3333902955774605e-05, + "loss": 0.0005, + "num_tokens": 1249479522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4530169838695912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025380004237447794, + "kl": 0.0126800537109375, + "learning_rate": 1.332828533674229e-05, + "loss": 0.0005, + "num_tokens": 1250047986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45318767602628657, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.455038780835967e-05, + "kl": 0.0125732421875, + "learning_rate": 1.3322666536288833e-05, + "loss": 0.0005, + "num_tokens": 1250615858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.453358368182982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001019387426783066, + "kl": 0.011871337890625, + "learning_rate": 1.331704655640869e-05, + "loss": 0.0005, + "num_tokens": 1251187154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4535290603396774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006090130373535962, + "kl": 0.0122528076171875, + "learning_rate": 1.3311425399096762e-05, + "loss": 0.0005, + "num_tokens": 1251756754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4536997524963728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005142528667445724, + "kl": 0.012451171875, + "learning_rate": 1.3305803066348352e-05, + "loss": 0.0005, + "num_tokens": 1252318498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4538704446530682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000181191681639945, + "kl": 0.01275634765625, + "learning_rate": 1.3300179560159186e-05, + "loss": 0.0005, + "num_tokens": 1252877106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4540411368097636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012123560859628134, + "kl": 0.0132598876953125, + "learning_rate": 1.3294554882525405e-05, + "loss": 0.0005, + "num_tokens": 1253436802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.454211828966459, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003562475510415279, + "kl": 0.012298583984375, + "learning_rate": 1.328892903544357e-05, + "loss": 0.0005, + "num_tokens": 1254002754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4543825211231544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011367629236363958, + "kl": 0.0126190185546875, + "learning_rate": 1.3283302020910647e-05, + "loss": 0.0005, + "num_tokens": 1254573170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45455321327984977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006928169961743089, + "kl": 0.0129547119140625, + "learning_rate": 1.327767384092403e-05, + "loss": 0.0005, + "num_tokens": 1255141378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4547239054365452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007568582171135729, + "kl": 0.0125732421875, + "learning_rate": 1.3272044497481516e-05, + "loss": 0.0005, + "num_tokens": 1255710338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4548945975932406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040733493511932765, + "kl": 0.0124359130859375, + "learning_rate": 1.326641399258132e-05, + "loss": 0.0005, + "num_tokens": 1256273346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.455065289749936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000319487018726, + "kl": 0.0126190185546875, + "learning_rate": 1.3260782328222066e-05, + "loss": 0.0005, + "num_tokens": 1256847778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4552359819066314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004070271731729146, + "kl": 0.012420654296875, + "learning_rate": 1.3255149506402793e-05, + "loss": 0.0005, + "num_tokens": 1257413218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4554066740633268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001587458741486399, + "kl": 0.0127716064453125, + "learning_rate": 1.324951552912295e-05, + "loss": 0.0005, + "num_tokens": 1257977442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4555773662200222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002680317517680467, + "kl": 0.012420654296875, + "learning_rate": 1.3243880398382394e-05, + "loss": 0.0005, + "num_tokens": 1258540386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4557480583767176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003831583824165688, + "kl": 0.012359619140625, + "learning_rate": 1.3238244116181396e-05, + "loss": 0.0005, + "num_tokens": 1259103954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45591875053341296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001960671692443093, + "kl": 0.0129852294921875, + "learning_rate": 1.3232606684520627e-05, + "loss": 0.0005, + "num_tokens": 1259669186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45608944269010837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019965710798332712, + "kl": 0.012542724609375, + "learning_rate": 1.322696810540118e-05, + "loss": 0.0005, + "num_tokens": 1260240770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4562601348468038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026824071016840715, + "kl": 0.0125732421875, + "learning_rate": 1.322132838082454e-05, + "loss": 0.0005, + "num_tokens": 1260806674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4564308270034992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004400997908901098, + "kl": 0.0126953125, + "learning_rate": 1.3215687512792607e-05, + "loss": 0.0005, + "num_tokens": 1261374354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4566015191601946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022401531994856283, + "kl": 0.0117950439453125, + "learning_rate": 1.3210045503307689e-05, + "loss": 0.0005, + "num_tokens": 1261941234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45677221131689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034555589658925374, + "kl": 0.0127716064453125, + "learning_rate": 1.320440235437249e-05, + "loss": 0.0005, + "num_tokens": 1262505282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4569429034735854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012451558136951282, + "kl": 0.012725830078125, + "learning_rate": 1.3198758067990132e-05, + "loss": 0.0005, + "num_tokens": 1263065810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4571135956302808, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028252064901719114, + "kl": 0.012481689453125, + "learning_rate": 1.3193112646164124e-05, + "loss": 0.0005, + "num_tokens": 1263627634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45728428778697616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001703249826642705, + "kl": 0.0120697021484375, + "learning_rate": 1.3187466090898396e-05, + "loss": 0.0005, + "num_tokens": 1264194402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45745497994367157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003497718703344937, + "kl": 0.012939453125, + "learning_rate": 1.3181818404197262e-05, + "loss": 0.0005, + "num_tokens": 1264763938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.457625672100367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005646179604629976, + "kl": 0.0123748779296875, + "learning_rate": 1.3176169588065456e-05, + "loss": 0.0005, + "num_tokens": 1265330274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4577963642570624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010329805015968992, + "kl": 0.01275634765625, + "learning_rate": 1.3170519644508095e-05, + "loss": 0.0005, + "num_tokens": 1265898514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4579670564137578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001715444300592576, + "kl": 0.0122222900390625, + "learning_rate": 1.3164868575530714e-05, + "loss": 0.0005, + "num_tokens": 1266469202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4581377485704532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011500154150842754, + "kl": 0.0126190185546875, + "learning_rate": 1.3159216383139228e-05, + "loss": 0.0005, + "num_tokens": 1267037810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4583084407271486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014192698678302824, + "kl": 0.0127105712890625, + "learning_rate": 1.3153563069339975e-05, + "loss": 0.0005, + "num_tokens": 1267603794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.458479132883844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028681379532390213, + "kl": 0.0128936767578125, + "learning_rate": 1.3147908636139662e-05, + "loss": 0.0005, + "num_tokens": 1268165106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45864982504053936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021109259419663668, + "kl": 0.012664794921875, + "learning_rate": 1.314225308554542e-05, + "loss": 0.0005, + "num_tokens": 1268732674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45882051719723477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013826281926305374, + "kl": 0.012603759765625, + "learning_rate": 1.313659641956476e-05, + "loss": 0.0005, + "num_tokens": 1269296226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4589912093539302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018992629721665974, + "kl": 0.0133819580078125, + "learning_rate": 1.3130938640205595e-05, + "loss": 0.0005, + "num_tokens": 1269857378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4591619015106256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013264429234188168, + "kl": 0.0120849609375, + "learning_rate": 1.3125279749476235e-05, + "loss": 0.0005, + "num_tokens": 1270421122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.459332593667321, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003648387965542727, + "kl": 0.012451171875, + "learning_rate": 1.3119619749385379e-05, + "loss": 0.0005, + "num_tokens": 1270984098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4595032858240164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033475529156521837, + "kl": 0.0125579833984375, + "learning_rate": 1.3113958641942124e-05, + "loss": 0.0005, + "num_tokens": 1271547474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4596739779807118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003641321832709645, + "kl": 0.0121307373046875, + "learning_rate": 1.310829642915596e-05, + "loss": 0.0005, + "num_tokens": 1272121762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4598446701374072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004164428024942463, + "kl": 0.012451171875, + "learning_rate": 1.3102633113036764e-05, + "loss": 0.0005, + "num_tokens": 1272690274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46001536229410256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002544025400389166, + "kl": 0.01263427734375, + "learning_rate": 1.3096968695594814e-05, + "loss": 0.0005, + "num_tokens": 1273257234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46018605445079797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026528141431452336, + "kl": 0.0126190185546875, + "learning_rate": 1.309130317884077e-05, + "loss": 0.0005, + "num_tokens": 1273818194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4603567466074934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004297463550703017, + "kl": 0.0124969482421875, + "learning_rate": 1.3085636564785686e-05, + "loss": 0.0005, + "num_tokens": 1274387186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4605274387641888, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001961122969383068, + "kl": 0.0126953125, + "learning_rate": 1.307996885544101e-05, + "loss": 0.0005, + "num_tokens": 1274953442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4606981309208842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002418995824957817, + "kl": 0.012542724609375, + "learning_rate": 1.3074300052818567e-05, + "loss": 0.0005, + "num_tokens": 1275530322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4608688230775796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014924068502101536, + "kl": 0.0125274658203125, + "learning_rate": 1.3068630158930583e-05, + "loss": 0.0005, + "num_tokens": 1276093378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.461039515234275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001421850613712845, + "kl": 0.0123443603515625, + "learning_rate": 1.3062959175789665e-05, + "loss": 0.0005, + "num_tokens": 1276659634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4612102073909704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026782183113081387, + "kl": 0.0125732421875, + "learning_rate": 1.3057287105408802e-05, + "loss": 0.0005, + "num_tokens": 1277222770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46138089954766576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006804471536521008, + "kl": 0.0123291015625, + "learning_rate": 1.3051613949801382e-05, + "loss": 0.0005, + "num_tokens": 1277784210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46155159170436116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007715604665830145, + "kl": 0.01226806640625, + "learning_rate": 1.3045939710981165e-05, + "loss": 0.0005, + "num_tokens": 1278354466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46172228386105657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000570518093943216, + "kl": 0.0126800537109375, + "learning_rate": 1.3040264390962305e-05, + "loss": 0.0005, + "num_tokens": 1278926226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.461892976017752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006590212478127305, + "kl": 0.0133056640625, + "learning_rate": 1.3034587991759331e-05, + "loss": 0.0005, + "num_tokens": 1279495202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4620636681744474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012197290801650537, + "kl": 0.0119781494140625, + "learning_rate": 1.3028910515387164e-05, + "loss": 0.0005, + "num_tokens": 1280061186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4622343603311428, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014513590149339416, + "kl": 0.0123443603515625, + "learning_rate": 1.30232319638611e-05, + "loss": 0.0005, + "num_tokens": 1280632226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4624050524878382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006155735710588167, + "kl": 0.01239013671875, + "learning_rate": 1.3017552339196821e-05, + "loss": 0.0005, + "num_tokens": 1281198034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4625757446445336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021048045719529037, + "kl": 0.012908935546875, + "learning_rate": 1.3011871643410391e-05, + "loss": 0.0005, + "num_tokens": 1281760402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.462746436801229, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022726656798530053, + "kl": 0.012451171875, + "learning_rate": 1.3006189878518249e-05, + "loss": 0.0005, + "num_tokens": 1282326210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46291712895792436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008866221886420129, + "kl": 0.0132904052734375, + "learning_rate": 1.3000507046537216e-05, + "loss": 0.0005, + "num_tokens": 1282895058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46308782111461977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045149013605791724, + "kl": 0.012847900390625, + "learning_rate": 1.2994823149484494e-05, + "loss": 0.0005, + "num_tokens": 1283454370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4632585132713152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032664929329263056, + "kl": 0.012542724609375, + "learning_rate": 1.2989138189377664e-05, + "loss": 0.0005, + "num_tokens": 1284022018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4634292054280106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003420199351524296, + "kl": 0.01287841796875, + "learning_rate": 1.2983452168234676e-05, + "loss": 0.0005, + "num_tokens": 1284588482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.463599897584706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023942583662046605, + "kl": 0.012542724609375, + "learning_rate": 1.2977765088073863e-05, + "loss": 0.0005, + "num_tokens": 1285160786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4637705897414014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007831447089263067, + "kl": 0.0127716064453125, + "learning_rate": 1.2972076950913937e-05, + "loss": 0.0005, + "num_tokens": 1285732466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4639412818980968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031111666943983674, + "kl": 0.0127716064453125, + "learning_rate": 1.2966387758773979e-05, + "loss": 0.0005, + "num_tokens": 1286301378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4641119740547922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015856590260142878, + "kl": 0.01263427734375, + "learning_rate": 1.2960697513673444e-05, + "loss": 0.0005, + "num_tokens": 1286876626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46428266621148756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000163866991723372, + "kl": 0.0124664306640625, + "learning_rate": 1.2955006217632172e-05, + "loss": 0.0005, + "num_tokens": 1287456690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46445335836818297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003624128011419293, + "kl": 0.01214599609375, + "learning_rate": 1.2949313872670356e-05, + "loss": 0.0005, + "num_tokens": 1288023122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4646240505248784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006613921776806138, + "kl": 0.0122528076171875, + "learning_rate": 1.2943620480808587e-05, + "loss": 0.0005, + "num_tokens": 1288592722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4647947426815738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022897660059493355, + "kl": 0.0118560791015625, + "learning_rate": 1.2937926044067802e-05, + "loss": 0.0005, + "num_tokens": 1289165650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4649654348382692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003574213044493799, + "kl": 0.012786865234375, + "learning_rate": 1.2932230564469324e-05, + "loss": 0.0005, + "num_tokens": 1289730850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4651361269949646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000204847443219112, + "kl": 0.013031005859375, + "learning_rate": 1.2926534044034841e-05, + "loss": 0.0005, + "num_tokens": 1290294034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46530681915166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004723801498372104, + "kl": 0.0123443603515625, + "learning_rate": 1.2920836484786419e-05, + "loss": 0.0005, + "num_tokens": 1290867874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4654775113083554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025919491653519005, + "kl": 0.012054443359375, + "learning_rate": 1.2915137888746478e-05, + "loss": 0.0005, + "num_tokens": 1291435346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46564820346505076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009979780382551132, + "kl": 0.0132904052734375, + "learning_rate": 1.2909438257937819e-05, + "loss": 0.0005, + "num_tokens": 1292002370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46581889562174617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003922971631858762, + "kl": 0.0129852294921875, + "learning_rate": 1.2903737594383601e-05, + "loss": 0.0005, + "num_tokens": 1292573890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46598958777844157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003056929689088674, + "kl": 0.0124969482421875, + "learning_rate": 1.2898035900107356e-05, + "loss": 0.0005, + "num_tokens": 1293139474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.466160279935137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044024090240083846, + "kl": 0.012786865234375, + "learning_rate": 1.289233317713298e-05, + "loss": 0.0005, + "num_tokens": 1293704850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4663309720918324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036985710986809874, + "kl": 0.0124053955078125, + "learning_rate": 1.2886629427484734e-05, + "loss": 0.0005, + "num_tokens": 1294268322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4665016642485278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004694870039234508, + "kl": 0.0128173828125, + "learning_rate": 1.2880924653187243e-05, + "loss": 0.0005, + "num_tokens": 1294832306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4666723564052232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020271649560586595, + "kl": 0.0127716064453125, + "learning_rate": 1.2875218856265494e-05, + "loss": 0.0005, + "num_tokens": 1295392946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4668430485619186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039320921469659286, + "kl": 0.0121002197265625, + "learning_rate": 1.2869512038744843e-05, + "loss": 0.0005, + "num_tokens": 1295953874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46701374071861396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015946326080160161, + "kl": 0.0128021240234375, + "learning_rate": 1.2863804202650998e-05, + "loss": 0.0005, + "num_tokens": 1296519634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46718443287530936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008699758788632805, + "kl": 0.0124053955078125, + "learning_rate": 1.285809535001004e-05, + "loss": 0.0005, + "num_tokens": 1297084242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46735512503200477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008733465609601263, + "kl": 0.0124664306640625, + "learning_rate": 1.2852385482848404e-05, + "loss": 0.0005, + "num_tokens": 1297651234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4675258171887002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009353562644868318, + "kl": 0.0129547119140625, + "learning_rate": 1.2846674603192888e-05, + "loss": 0.0005, + "num_tokens": 1298217362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4676965093453956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037730539220196777, + "kl": 0.012481689453125, + "learning_rate": 1.2840962713070645e-05, + "loss": 0.0005, + "num_tokens": 1298790050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.467867201502091, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007936228232821657, + "kl": 0.012481689453125, + "learning_rate": 1.2835249814509194e-05, + "loss": 0.0005, + "num_tokens": 1299352722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4680378936587864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005071928455339717, + "kl": 0.0126953125, + "learning_rate": 1.2829535909536403e-05, + "loss": 0.0005, + "num_tokens": 1299914450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4682085858154818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019518695234072639, + "kl": 0.0123291015625, + "learning_rate": 1.2823821000180508e-05, + "loss": 0.0005, + "num_tokens": 1300475986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46837927797217715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001784170075332107, + "kl": 0.0132293701171875, + "learning_rate": 1.2818105088470088e-05, + "loss": 0.0005, + "num_tokens": 1301036658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46854997012887256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032593089910651867, + "kl": 0.0122833251953125, + "learning_rate": 1.2812388176434091e-05, + "loss": 0.0005, + "num_tokens": 1301601602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46872066228556797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005892248482193508, + "kl": 0.012969970703125, + "learning_rate": 1.2806670266101814e-05, + "loss": 0.0005, + "num_tokens": 1302162434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4688913544422634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006441106551241819, + "kl": 0.01239013671875, + "learning_rate": 1.2800951359502907e-05, + "loss": 0.0005, + "num_tokens": 1302727954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4690620465989588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018147828797375587, + "kl": 0.0128936767578125, + "learning_rate": 1.2795231458667374e-05, + "loss": 0.0005, + "num_tokens": 1303292194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4692327387556542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009365204933535098, + "kl": 0.0123748779296875, + "learning_rate": 1.2789510565625576e-05, + "loss": 0.0005, + "num_tokens": 1303857858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4694034309123496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005358470783542004, + "kl": 0.012847900390625, + "learning_rate": 1.2783788682408225e-05, + "loss": 0.0005, + "num_tokens": 1304427634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.469574123069045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005168059001249472, + "kl": 0.012664794921875, + "learning_rate": 1.2778065811046376e-05, + "loss": 0.0005, + "num_tokens": 1304996002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46974481522574035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005994123225261098, + "kl": 0.012542724609375, + "learning_rate": 1.2772341953571452e-05, + "loss": 0.0005, + "num_tokens": 1305559298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.46991550738243576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000633763353302406, + "kl": 0.0124053955078125, + "learning_rate": 1.2766617112015207e-05, + "loss": 0.0005, + "num_tokens": 1306123170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47008619953913117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007704493973419907, + "kl": 0.0127410888671875, + "learning_rate": 1.2760891288409759e-05, + "loss": 0.0005, + "num_tokens": 1306687250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4702568916958266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008583799247102028, + "kl": 0.012725830078125, + "learning_rate": 1.2755164484787562e-05, + "loss": 0.0005, + "num_tokens": 1307252898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.470427583852522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000631276606555094, + "kl": 0.0125885009765625, + "learning_rate": 1.2749436703181432e-05, + "loss": 0.0005, + "num_tokens": 1307822258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4705982760092174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006336755759997684, + "kl": 0.0128936767578125, + "learning_rate": 1.274370794562452e-05, + "loss": 0.0005, + "num_tokens": 1308389410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4707689681659128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005560187523759716, + "kl": 0.01263427734375, + "learning_rate": 1.2737978214150331e-05, + "loss": 0.0005, + "num_tokens": 1308950994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4709396603226082, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006667322764493556, + "kl": 0.012237548828125, + "learning_rate": 1.2732247510792707e-05, + "loss": 0.0005, + "num_tokens": 1309519202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47111035247930355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004107441012289652, + "kl": 0.012603759765625, + "learning_rate": 1.2726515837585852e-05, + "loss": 0.0005, + "num_tokens": 1310095474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47128104463599896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006877042305673429, + "kl": 0.012725830078125, + "learning_rate": 1.272078319656429e-05, + "loss": 0.0005, + "num_tokens": 1310669554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47145173679269436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010020164690207733, + "kl": 0.0119476318359375, + "learning_rate": 1.2715049589762909e-05, + "loss": 0.0005, + "num_tokens": 1311235410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47162242894938977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011229862537323106, + "kl": 0.01251220703125, + "learning_rate": 1.2709315019216933e-05, + "loss": 0.0005, + "num_tokens": 1311800674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4717931211060852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002377801733069572, + "kl": 0.0123443603515625, + "learning_rate": 1.2703579486961925e-05, + "loss": 0.0005, + "num_tokens": 1312364418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4719638132627806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005665016421321039, + "kl": 0.0124969482421875, + "learning_rate": 1.2697842995033793e-05, + "loss": 0.0005, + "num_tokens": 1312927778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.472134505419476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016289231132530786, + "kl": 0.0125732421875, + "learning_rate": 1.2692105545468785e-05, + "loss": 0.0005, + "num_tokens": 1313489010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4723051975761714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044066056422479823, + "kl": 0.0130767822265625, + "learning_rate": 1.2686367140303489e-05, + "loss": 0.0005, + "num_tokens": 1314051986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47247588973286675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006843591437035652, + "kl": 0.01275634765625, + "learning_rate": 1.2680627781574828e-05, + "loss": 0.0005, + "num_tokens": 1314618050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47264658188956216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004456020012813186, + "kl": 0.012786865234375, + "learning_rate": 1.267488747132007e-05, + "loss": 0.0005, + "num_tokens": 1315188882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47281727404625756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016936181737379482, + "kl": 0.0121002197265625, + "learning_rate": 1.2669146211576817e-05, + "loss": 0.0005, + "num_tokens": 1315757330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47298796620295297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006303058649055473, + "kl": 0.0126953125, + "learning_rate": 1.2663404004383012e-05, + "loss": 0.0005, + "num_tokens": 1316321250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4731586583596484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027353971248606315, + "kl": 0.01251220703125, + "learning_rate": 1.2657660851776928e-05, + "loss": 0.0005, + "num_tokens": 1316886258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4733293505163438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038544305022295837, + "kl": 0.0136260986328125, + "learning_rate": 1.2651916755797176e-05, + "loss": 0.0005, + "num_tokens": 1317453250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4735000426730392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025277240238592516, + "kl": 0.012420654296875, + "learning_rate": 1.2646171718482706e-05, + "loss": 0.0005, + "num_tokens": 1318018706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4736707348297346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013402388181487802, + "kl": 0.0125274658203125, + "learning_rate": 1.2640425741872796e-05, + "loss": 0.0005, + "num_tokens": 1318582786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47384142698642995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000947383317718779, + "kl": 0.012908935546875, + "learning_rate": 1.263467882800706e-05, + "loss": 0.0005, + "num_tokens": 1319146658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47401211914312535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005928542676734397, + "kl": 0.012542724609375, + "learning_rate": 1.262893097892545e-05, + "loss": 0.0005, + "num_tokens": 1319709666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47418281129982076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033914382412213455, + "kl": 0.011810302734375, + "learning_rate": 1.262318219666824e-05, + "loss": 0.0005, + "num_tokens": 1320283314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47435350345651617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003312737660579299, + "kl": 0.012298583984375, + "learning_rate": 1.261743248327604e-05, + "loss": 0.0005, + "num_tokens": 1320843794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4745241956132116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028483437579821934, + "kl": 0.0125885009765625, + "learning_rate": 1.2611681840789796e-05, + "loss": 0.0005, + "num_tokens": 1321409266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.474694887769907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001965296998941878, + "kl": 0.01275634765625, + "learning_rate": 1.2605930271250771e-05, + "loss": 0.0005, + "num_tokens": 1321978962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4748655799266024, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005045119491702444, + "kl": 0.0123443603515625, + "learning_rate": 1.2600177776700573e-05, + "loss": 0.0005, + "num_tokens": 1322546866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4750362720832978, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015861837296713788, + "kl": 0.012176513671875, + "learning_rate": 1.259442435918112e-05, + "loss": 0.0005, + "num_tokens": 1323112018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47520696423999315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022399525187602847, + "kl": 0.0134429931640625, + "learning_rate": 1.2588670020734677e-05, + "loss": 0.0005, + "num_tokens": 1323680370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47537765639668855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019630201872156947, + "kl": 0.0123443603515625, + "learning_rate": 1.2582914763403817e-05, + "loss": 0.0005, + "num_tokens": 1324248482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47554834855338396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011090173735884683, + "kl": 0.0130615234375, + "learning_rate": 1.2577158589231462e-05, + "loss": 0.0005, + "num_tokens": 1324816242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47571904071007937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001299834279979362, + "kl": 0.0126495361328125, + "learning_rate": 1.2571401500260829e-05, + "loss": 0.0005, + "num_tokens": 1325381922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4758897328667748, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024071383936285823, + "kl": 0.0125579833984375, + "learning_rate": 1.2565643498535493e-05, + "loss": 0.0005, + "num_tokens": 1325944034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4760604250234702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012284553055569268, + "kl": 0.012542724609375, + "learning_rate": 1.2559884586099324e-05, + "loss": 0.0005, + "num_tokens": 1326515346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4762311171801656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002849894221092544, + "kl": 0.0124053955078125, + "learning_rate": 1.2554124764996539e-05, + "loss": 0.0005, + "num_tokens": 1327077490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.476401809336861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023529773142411583, + "kl": 0.0122528076171875, + "learning_rate": 1.2548364037271657e-05, + "loss": 0.0005, + "num_tokens": 1327638994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47657250149355634, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.704007404460586e-05, + "kl": 0.0125732421875, + "learning_rate": 1.254260240496953e-05, + "loss": 0.0005, + "num_tokens": 1328214194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47674319365025175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027222995984087505, + "kl": 0.012176513671875, + "learning_rate": 1.2536839870135337e-05, + "loss": 0.0005, + "num_tokens": 1328776658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47691388580694716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005121561592732912, + "kl": 0.0123443603515625, + "learning_rate": 1.2531076434814562e-05, + "loss": 0.0005, + "num_tokens": 1329342642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47708457796364256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002730288208767826, + "kl": 0.012420654296875, + "learning_rate": 1.2525312101053022e-05, + "loss": 0.0005, + "num_tokens": 1329904754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47725527012033797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031628615707537814, + "kl": 0.013092041015625, + "learning_rate": 1.2519546870896844e-05, + "loss": 0.0005, + "num_tokens": 1330468370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4774259622770334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023394812580385777, + "kl": 0.01202392578125, + "learning_rate": 1.2513780746392474e-05, + "loss": 0.0005, + "num_tokens": 1331046354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4775966544337288, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.630924072922636e-05, + "kl": 0.0128021240234375, + "learning_rate": 1.2508013729586686e-05, + "loss": 0.0005, + "num_tokens": 1331615202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4777673465904242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026611703149071034, + "kl": 0.012786865234375, + "learning_rate": 1.2502245822526553e-05, + "loss": 0.0005, + "num_tokens": 1332180146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47793803874711954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002162064136467627, + "kl": 0.012603759765625, + "learning_rate": 1.2496477027259482e-05, + "loss": 0.0005, + "num_tokens": 1332746402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47810873090381495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031609416181123206, + "kl": 0.0129241943359375, + "learning_rate": 1.2490707345833184e-05, + "loss": 0.0005, + "num_tokens": 1333310354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47827942306051036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002726874462996603, + "kl": 0.0130157470703125, + "learning_rate": 1.2484936780295687e-05, + "loss": 0.0005, + "num_tokens": 1333876434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47845011521720576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001748864477085827, + "kl": 0.0124359130859375, + "learning_rate": 1.2479165332695333e-05, + "loss": 0.0005, + "num_tokens": 1334443186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47862080737390117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002112016041478828, + "kl": 0.01214599609375, + "learning_rate": 1.247339300508078e-05, + "loss": 0.0005, + "num_tokens": 1335015106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4787914995305966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007880236215030157, + "kl": 0.0127410888671875, + "learning_rate": 1.2467619799500996e-05, + "loss": 0.0005, + "num_tokens": 1335585602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.478962191687292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045902474040568456, + "kl": 0.01214599609375, + "learning_rate": 1.2461845718005257e-05, + "loss": 0.0005, + "num_tokens": 1336149234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4791328838439874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031170388012567143, + "kl": 0.01275634765625, + "learning_rate": 1.2456070762643155e-05, + "loss": 0.0005, + "num_tokens": 1336712162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4793035760006828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005720424266406784, + "kl": 0.0118408203125, + "learning_rate": 1.2450294935464593e-05, + "loss": 0.0005, + "num_tokens": 1337272738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47947426815737815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013536372550736978, + "kl": 0.012603759765625, + "learning_rate": 1.244451823851978e-05, + "loss": 0.0005, + "num_tokens": 1337837202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47964496031407355, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.625823033019751e-05, + "kl": 0.0127105712890625, + "learning_rate": 1.2438740673859232e-05, + "loss": 0.0005, + "num_tokens": 1338406466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47981565247076896, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.616712612602197e-05, + "kl": 0.0126953125, + "learning_rate": 1.2432962243533777e-05, + "loss": 0.0005, + "num_tokens": 1338969810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47998634462746437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003245963810183467, + "kl": 0.01300048828125, + "learning_rate": 1.2427182949594555e-05, + "loss": 0.0005, + "num_tokens": 1339539538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4801570367841598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000608369540122229, + "kl": 0.0125732421875, + "learning_rate": 1.2421402794092998e-05, + "loss": 0.0005, + "num_tokens": 1340108114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4803277289408552, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.639357275312752e-05, + "kl": 0.01287841796875, + "learning_rate": 1.2415621779080862e-05, + "loss": 0.0005, + "num_tokens": 1340668754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4804984210975506, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020467140019022443, + "kl": 0.0121917724609375, + "learning_rate": 1.2409839906610188e-05, + "loss": 0.0005, + "num_tokens": 1341231570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.480669113254246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014106119211418835, + "kl": 0.012237548828125, + "learning_rate": 1.2404057178733341e-05, + "loss": 0.0005, + "num_tokens": 1341798802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48083980541094135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001644793005392532, + "kl": 0.0120697021484375, + "learning_rate": 1.2398273597502974e-05, + "loss": 0.0005, + "num_tokens": 1342367170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48101049756763675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002641218410879123, + "kl": 0.0123291015625, + "learning_rate": 1.2392489164972057e-05, + "loss": 0.0005, + "num_tokens": 1342932146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48118118972433216, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.759039860155846e-05, + "kl": 0.0119781494140625, + "learning_rate": 1.2386703883193845e-05, + "loss": 0.0005, + "num_tokens": 1343500354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48135188188102757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006520707226551427, + "kl": 0.0121612548828125, + "learning_rate": 1.2380917754221914e-05, + "loss": 0.0005, + "num_tokens": 1344069554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.481522574037723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001280925399346751, + "kl": 0.0122222900390625, + "learning_rate": 1.2375130780110122e-05, + "loss": 0.0005, + "num_tokens": 1344637698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4816932661944184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010736496172124557, + "kl": 0.0119476318359375, + "learning_rate": 1.2369342962912643e-05, + "loss": 0.0005, + "num_tokens": 1345206882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4818639583511138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017710264410242596, + "kl": 0.0125885009765625, + "learning_rate": 1.236355430468394e-05, + "loss": 0.0005, + "num_tokens": 1345773394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4820346505078092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027160760404545156, + "kl": 0.0128021240234375, + "learning_rate": 1.235776480747878e-05, + "loss": 0.0005, + "num_tokens": 1346340834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48220534266450454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016948801618898128, + "kl": 0.0125885009765625, + "learning_rate": 1.2351974473352221e-05, + "loss": 0.0005, + "num_tokens": 1346901538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48237603482119995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002738798182287315, + "kl": 0.012786865234375, + "learning_rate": 1.234618330435963e-05, + "loss": 0.0005, + "num_tokens": 1347467986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48254672697789536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007220509491974338, + "kl": 0.012542724609375, + "learning_rate": 1.2340391302556654e-05, + "loss": 0.0005, + "num_tokens": 1348030434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48271741913459076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036406809464239407, + "kl": 0.012847900390625, + "learning_rate": 1.2334598469999248e-05, + "loss": 0.0005, + "num_tokens": 1348599090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48288811129128617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003492695239012413, + "kl": 0.0128021240234375, + "learning_rate": 1.2328804808743662e-05, + "loss": 0.0005, + "num_tokens": 1349165186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4830588034479816, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005692552381285726, + "kl": 0.0126495361328125, + "learning_rate": 1.2323010320846434e-05, + "loss": 0.0005, + "num_tokens": 1349727922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.483229495604677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000136537510125055, + "kl": 0.0126495361328125, + "learning_rate": 1.2317215008364397e-05, + "loss": 0.0005, + "num_tokens": 1350298130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4834001877613724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012964652304240095, + "kl": 0.01275634765625, + "learning_rate": 1.2311418873354679e-05, + "loss": 0.0005, + "num_tokens": 1350861282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48357087991806774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003197617557904487, + "kl": 0.0120391845703125, + "learning_rate": 1.2305621917874697e-05, + "loss": 0.0005, + "num_tokens": 1351424050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48374157207476315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047656303673165103, + "kl": 0.0126190185546875, + "learning_rate": 1.2299824143982165e-05, + "loss": 0.0005, + "num_tokens": 1352000914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48391226423145856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001964295241666741, + "kl": 0.01251220703125, + "learning_rate": 1.2294025553735078e-05, + "loss": 0.0005, + "num_tokens": 1352569330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48408295638815396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001092661156497906, + "kl": 0.0122222900390625, + "learning_rate": 1.2288226149191731e-05, + "loss": 0.0005, + "num_tokens": 1353136386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48425364854484937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004860897579796829, + "kl": 0.01324462890625, + "learning_rate": 1.22824259324107e-05, + "loss": 0.0005, + "num_tokens": 1353700194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4844243407015448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024962075859821277, + "kl": 0.0125885009765625, + "learning_rate": 1.2276624905450856e-05, + "loss": 0.0005, + "num_tokens": 1354264258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4845950328582402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023843040798126176, + "kl": 0.0126953125, + "learning_rate": 1.2270823070371349e-05, + "loss": 0.0005, + "num_tokens": 1354830610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4847657250149356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038314786245488986, + "kl": 0.0129241943359375, + "learning_rate": 1.2265020429231627e-05, + "loss": 0.0005, + "num_tokens": 1355394738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48493641717163094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012819958475844486, + "kl": 0.0126800537109375, + "learning_rate": 1.2259216984091411e-05, + "loss": 0.0005, + "num_tokens": 1355956594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48510710932832635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003728787152614908, + "kl": 0.012481689453125, + "learning_rate": 1.2253412737010723e-05, + "loss": 0.0005, + "num_tokens": 1356515250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48527780148502175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026339778866021674, + "kl": 0.0127410888671875, + "learning_rate": 1.2247607690049855e-05, + "loss": 0.0005, + "num_tokens": 1357082450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48544849364171716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034205302825958265, + "kl": 0.0120697021484375, + "learning_rate": 1.2241801845269392e-05, + "loss": 0.0005, + "num_tokens": 1357646242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48561918579841257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002689293597437284, + "kl": 0.0121307373046875, + "learning_rate": 1.2235995204730195e-05, + "loss": 0.0005, + "num_tokens": 1358205874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.485789877955108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001577432366548565, + "kl": 0.0125732421875, + "learning_rate": 1.223018777049342e-05, + "loss": 0.0005, + "num_tokens": 1358772402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4859605701118034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016773440043924847, + "kl": 0.0119476318359375, + "learning_rate": 1.2224379544620486e-05, + "loss": 0.0005, + "num_tokens": 1359338050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4861312622684988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006152644017005287, + "kl": 0.012115478515625, + "learning_rate": 1.2218570529173115e-05, + "loss": 0.0005, + "num_tokens": 1359901986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48630195442519414, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.965919886747933e-05, + "kl": 0.01287841796875, + "learning_rate": 1.2212760726213289e-05, + "loss": 0.0005, + "num_tokens": 1360463458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48647264658188955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003448044211685475, + "kl": 0.0127716064453125, + "learning_rate": 1.2206950137803282e-05, + "loss": 0.0005, + "num_tokens": 1361029650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48664333873858495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005511220163197581, + "kl": 0.0125274658203125, + "learning_rate": 1.220113876600564e-05, + "loss": 0.0005, + "num_tokens": 1361595986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48681403089528036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001854222001374531, + "kl": 0.0123748779296875, + "learning_rate": 1.2195326612883196e-05, + "loss": 0.0005, + "num_tokens": 1362158482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48698472305197577, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.364371887790553e-05, + "kl": 0.0128173828125, + "learning_rate": 1.218951368049905e-05, + "loss": 0.0005, + "num_tokens": 1362727874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48715541520867117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001926370340220239, + "kl": 0.013427734375, + "learning_rate": 1.2183699970916583e-05, + "loss": 0.0005, + "num_tokens": 1363292498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4873261073653666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003308320536758378, + "kl": 0.0122833251953125, + "learning_rate": 1.2177885486199454e-05, + "loss": 0.0005, + "num_tokens": 1363857874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.487496799522062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000391473826100124, + "kl": 0.0121612548828125, + "learning_rate": 1.2172070228411597e-05, + "loss": 0.0005, + "num_tokens": 1364423794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48766749167875734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006629864224168211, + "kl": 0.012969970703125, + "learning_rate": 1.2166254199617214e-05, + "loss": 0.0005, + "num_tokens": 1364988434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48783818383545274, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.987997172505066e-05, + "kl": 0.0120391845703125, + "learning_rate": 1.2160437401880786e-05, + "loss": 0.0005, + "num_tokens": 1365559522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48800887599214815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021051962078551156, + "kl": 0.0118560791015625, + "learning_rate": 1.215461983726707e-05, + "loss": 0.0005, + "num_tokens": 1366128354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48817956814884356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005598467557136464, + "kl": 0.0129547119140625, + "learning_rate": 1.2148801507841088e-05, + "loss": 0.0005, + "num_tokens": 1366693874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48835026030553896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001637857530065149, + "kl": 0.0122833251953125, + "learning_rate": 1.2142982415668139e-05, + "loss": 0.0005, + "num_tokens": 1367255698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48852095246223437, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.906213002250275e-05, + "kl": 0.0121917724609375, + "learning_rate": 1.2137162562813789e-05, + "loss": 0.0005, + "num_tokens": 1367822962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4886916446189298, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.132896254587e-05, + "kl": 0.0124053955078125, + "learning_rate": 1.2131341951343876e-05, + "loss": 0.0005, + "num_tokens": 1368392386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4888623367756252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004352053648626593, + "kl": 0.0125579833984375, + "learning_rate": 1.2125520583324508e-05, + "loss": 0.0005, + "num_tokens": 1368962146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48903302893232053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006457444133521297, + "kl": 0.0130767822265625, + "learning_rate": 1.211969846082206e-05, + "loss": 0.0005, + "num_tokens": 1369524882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48920372108901594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006640346144118648, + "kl": 0.01312255859375, + "learning_rate": 1.2113875585903173e-05, + "loss": 0.0005, + "num_tokens": 1370090658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48937441324571135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002588841324005849, + "kl": 0.0123291015625, + "learning_rate": 1.2108051960634761e-05, + "loss": 0.0005, + "num_tokens": 1370653154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48954510540240675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019471620731481695, + "kl": 0.0126800537109375, + "learning_rate": 1.2102227587084e-05, + "loss": 0.0005, + "num_tokens": 1371221330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48971579755910216, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.723516255174715e-05, + "kl": 0.0121002197265625, + "learning_rate": 1.2096402467318332e-05, + "loss": 0.0005, + "num_tokens": 1371784674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48988648971579757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029108449731008194, + "kl": 0.0125732421875, + "learning_rate": 1.2090576603405462e-05, + "loss": 0.0005, + "num_tokens": 1372351442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.490057181872493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038689966874032035, + "kl": 0.0125885009765625, + "learning_rate": 1.2084749997413363e-05, + "loss": 0.0005, + "num_tokens": 1372922658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4902278740291884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018739712388241982, + "kl": 0.0120697021484375, + "learning_rate": 1.2078922651410272e-05, + "loss": 0.0005, + "num_tokens": 1373502418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49039856618588373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024064422715303464, + "kl": 0.0128631591796875, + "learning_rate": 1.2073094567464683e-05, + "loss": 0.0005, + "num_tokens": 1374066674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49056925834257914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032967802579834965, + "kl": 0.0132904052734375, + "learning_rate": 1.206726574764536e-05, + "loss": 0.0005, + "num_tokens": 1374636130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49073995049927455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016446347761059674, + "kl": 0.0122222900390625, + "learning_rate": 1.2061436194021317e-05, + "loss": 0.0005, + "num_tokens": 1375200370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49091064265596995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012010056161393824, + "kl": 0.01239013671875, + "learning_rate": 1.205560590866184e-05, + "loss": 0.0005, + "num_tokens": 1375767138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49108133481266536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001887800085767941, + "kl": 0.0130157470703125, + "learning_rate": 1.2049774893636468e-05, + "loss": 0.0005, + "num_tokens": 1376330690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49125202696936077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004435789428850145, + "kl": 0.013275146484375, + "learning_rate": 1.2043943151015003e-05, + "loss": 0.0005, + "num_tokens": 1376904434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4914227191260562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002659251956544139, + "kl": 0.012359619140625, + "learning_rate": 1.20381106828675e-05, + "loss": 0.0005, + "num_tokens": 1377473554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4915934112827516, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.849849060648593e-05, + "kl": 0.012908935546875, + "learning_rate": 1.2032277491264277e-05, + "loss": 0.0005, + "num_tokens": 1378041394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49176410343944693, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.261453932614862e-05, + "kl": 0.0120849609375, + "learning_rate": 1.2026443578275903e-05, + "loss": 0.0005, + "num_tokens": 1378617746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49193479559614234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019469948638767783, + "kl": 0.01202392578125, + "learning_rate": 1.2020608945973216e-05, + "loss": 0.0005, + "num_tokens": 1379184338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49210548775283774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008372804552983483, + "kl": 0.0130767822265625, + "learning_rate": 1.2014773596427284e-05, + "loss": 0.0005, + "num_tokens": 1379749954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49227617990953315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019009416813034073, + "kl": 0.0125274658203125, + "learning_rate": 1.200893753170946e-05, + "loss": 0.0005, + "num_tokens": 1380317138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49244687206622856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003081863002448159, + "kl": 0.0123138427734375, + "learning_rate": 1.2003100753891328e-05, + "loss": 0.0005, + "num_tokens": 1380884258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49261756422292396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004910204679037304, + "kl": 0.0121002197265625, + "learning_rate": 1.1997263265044739e-05, + "loss": 0.0005, + "num_tokens": 1381444146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49278825637961937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003577668495738167, + "kl": 0.0123748779296875, + "learning_rate": 1.1991425067241787e-05, + "loss": 0.0005, + "num_tokens": 1382006978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4929589485363148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003156690580097894, + "kl": 0.0123443603515625, + "learning_rate": 1.198558616255482e-05, + "loss": 0.0005, + "num_tokens": 1382570770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49312964069301013, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.566387230060518e-05, + "kl": 0.0124053955078125, + "learning_rate": 1.1979746553056443e-05, + "loss": 0.0005, + "num_tokens": 1383141858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49330033284970554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030093084664319547, + "kl": 0.013519287109375, + "learning_rate": 1.1973906240819506e-05, + "loss": 0.0005, + "num_tokens": 1383709602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49347102500640094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029605338184699685, + "kl": 0.012451171875, + "learning_rate": 1.1968065227917105e-05, + "loss": 0.0005, + "num_tokens": 1384273282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49364171716309635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001330755480927821, + "kl": 0.0127105712890625, + "learning_rate": 1.1962223516422593e-05, + "loss": 0.0005, + "num_tokens": 1384837986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49381240931979176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000179456312241136, + "kl": 0.0128936767578125, + "learning_rate": 1.1956381108409563e-05, + "loss": 0.0005, + "num_tokens": 1385400498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49398310147648716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009805911603768288, + "kl": 0.012542724609375, + "learning_rate": 1.1950538005951862e-05, + "loss": 0.0005, + "num_tokens": 1385967634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49415379363318257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017175115148804684, + "kl": 0.0124053955078125, + "learning_rate": 1.1944694211123578e-05, + "loss": 0.0005, + "num_tokens": 1386538722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.494324485789878, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.395284075697211e-05, + "kl": 0.013092041015625, + "learning_rate": 1.1938849725999048e-05, + "loss": 0.0005, + "num_tokens": 1387105074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4944951779465733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039294395376507493, + "kl": 0.0128173828125, + "learning_rate": 1.1933004552652859e-05, + "loss": 0.0005, + "num_tokens": 1387669026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49466587010326873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005292867186352127, + "kl": 0.0124969482421875, + "learning_rate": 1.1927158693159826e-05, + "loss": 0.0005, + "num_tokens": 1388230834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49483656225996414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027091778281102257, + "kl": 0.012847900390625, + "learning_rate": 1.1921312149595024e-05, + "loss": 0.0005, + "num_tokens": 1388802594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49500725441665955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000340104600407764, + "kl": 0.0118255615234375, + "learning_rate": 1.1915464924033766e-05, + "loss": 0.0005, + "num_tokens": 1389363234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49517794657335495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012758422945211392, + "kl": 0.012939453125, + "learning_rate": 1.19096170185516e-05, + "loss": 0.0005, + "num_tokens": 1389931778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49534863873005036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015138424365257691, + "kl": 0.0124359130859375, + "learning_rate": 1.1903768435224333e-05, + "loss": 0.0005, + "num_tokens": 1390504354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49551933088674577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023594249039486588, + "kl": 0.01373291015625, + "learning_rate": 1.1897919176127986e-05, + "loss": 0.0005, + "num_tokens": 1391072466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4956900230434412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000255300527398897, + "kl": 0.0128326416015625, + "learning_rate": 1.1892069243338848e-05, + "loss": 0.0005, + "num_tokens": 1391638578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4958607152001366, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.01489685163695e-05, + "kl": 0.012603759765625, + "learning_rate": 1.1886218638933424e-05, + "loss": 0.0005, + "num_tokens": 1392217650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49603140735683193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003756369958468218, + "kl": 0.012451171875, + "learning_rate": 1.1880367364988476e-05, + "loss": 0.0005, + "num_tokens": 1392782370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49620209951352734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003970865623067359, + "kl": 0.01251220703125, + "learning_rate": 1.1874515423580988e-05, + "loss": 0.0005, + "num_tokens": 1393348754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49637279167022275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001264584660646504, + "kl": 0.012115478515625, + "learning_rate": 1.1868662816788192e-05, + "loss": 0.0005, + "num_tokens": 1393913522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49654348382691815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007245912560823321, + "kl": 0.0120697021484375, + "learning_rate": 1.1862809546687548e-05, + "loss": 0.0005, + "num_tokens": 1394479330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49671417598361356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012122222772890337, + "kl": 0.01275634765625, + "learning_rate": 1.1856955615356765e-05, + "loss": 0.0005, + "num_tokens": 1395042802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49688486814030897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005025885447827897, + "kl": 0.0123138427734375, + "learning_rate": 1.1851101024873763e-05, + "loss": 0.0005, + "num_tokens": 1395606594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4970555602970044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002875065459858436, + "kl": 0.0121612548828125, + "learning_rate": 1.1845245777316724e-05, + "loss": 0.0005, + "num_tokens": 1396173106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4972262524536998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005773257412063403, + "kl": 0.0123748779296875, + "learning_rate": 1.183938987476404e-05, + "loss": 0.0005, + "num_tokens": 1396746482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49739694461039513, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.868724352791409e-05, + "kl": 0.012420654296875, + "learning_rate": 1.183353331929435e-05, + "loss": 0.0005, + "num_tokens": 1397315602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49756763676709054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036101751507657085, + "kl": 0.011810302734375, + "learning_rate": 1.1827676112986519e-05, + "loss": 0.0005, + "num_tokens": 1397888082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49773832892378594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008932502130445159, + "kl": 0.0128021240234375, + "learning_rate": 1.1821818257919642e-05, + "loss": 0.0005, + "num_tokens": 1398459554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49790902108048135, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.946470897041313e-05, + "kl": 0.0125732421875, + "learning_rate": 1.1815959756173047e-05, + "loss": 0.0005, + "num_tokens": 1399028914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49807971323717676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003530016023217557, + "kl": 0.012664794921875, + "learning_rate": 1.1810100609826294e-05, + "loss": 0.0005, + "num_tokens": 1399597890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49825040539387216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020579914478498207, + "kl": 0.012451171875, + "learning_rate": 1.1804240820959163e-05, + "loss": 0.0005, + "num_tokens": 1400162994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49842109755056757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019708438696626468, + "kl": 0.0123138427734375, + "learning_rate": 1.1798380391651669e-05, + "loss": 0.0005, + "num_tokens": 1400724786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.498591789707263, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.090240960650272e-05, + "kl": 0.01239013671875, + "learning_rate": 1.1792519323984056e-05, + "loss": 0.0005, + "num_tokens": 1401287986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49876248186395833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011169807637162224, + "kl": 0.0125274658203125, + "learning_rate": 1.1786657620036788e-05, + "loss": 0.0005, + "num_tokens": 1401853186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49893317402065374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012524475725785033, + "kl": 0.012664794921875, + "learning_rate": 1.178079528189056e-05, + "loss": 0.0005, + "num_tokens": 1402426882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49910386617734914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001860742517006178, + "kl": 0.0128936767578125, + "learning_rate": 1.177493231162629e-05, + "loss": 0.0005, + "num_tokens": 1402994002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49927455833404455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024272610781793806, + "kl": 0.0125579833984375, + "learning_rate": 1.176906871132512e-05, + "loss": 0.0005, + "num_tokens": 1403564306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49944525049073996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000504657606267172, + "kl": 0.01251220703125, + "learning_rate": 1.1763204483068418e-05, + "loss": 0.0005, + "num_tokens": 1404126050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49961594264743536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003076273230237207, + "kl": 0.0118408203125, + "learning_rate": 1.1757339628937772e-05, + "loss": 0.0005, + "num_tokens": 1404694738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49978663480413077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003419413054528366, + "kl": 0.01226806640625, + "learning_rate": 1.1751474151014995e-05, + "loss": 0.0005, + "num_tokens": 1405258658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4999573269608262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017870238163206442, + "kl": 0.0124053955078125, + "learning_rate": 1.1745608051382118e-05, + "loss": 0.0005, + "num_tokens": 1405821714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5001280191175216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004190909875064885, + "kl": 0.0125885009765625, + "learning_rate": 1.1739741332121394e-05, + "loss": 0.0005, + "num_tokens": 1406393826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.500298711274217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001353069755408242, + "kl": 0.0127716064453125, + "learning_rate": 1.1733873995315301e-05, + "loss": 0.0005, + "num_tokens": 1406959234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5004694034309124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011022884480048924, + "kl": 0.012115478515625, + "learning_rate": 1.1728006043046528e-05, + "loss": 0.0005, + "num_tokens": 1407524338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5006400955876078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044136899110843944, + "kl": 0.01226806640625, + "learning_rate": 1.1722137477397983e-05, + "loss": 0.0005, + "num_tokens": 1408090114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5008107877443031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001524868100564273, + "kl": 0.0125732421875, + "learning_rate": 1.1716268300452807e-05, + "loss": 0.0005, + "num_tokens": 1408669266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5009814799009985, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.8447012577011885e-05, + "kl": 0.0127410888671875, + "learning_rate": 1.1710398514294333e-05, + "loss": 0.0005, + "num_tokens": 1409231410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5011521720576939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028559617668864743, + "kl": 0.0126190185546875, + "learning_rate": 1.1704528121006132e-05, + "loss": 0.0005, + "num_tokens": 1409795618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5013228642143893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014988511411512343, + "kl": 0.01190185546875, + "learning_rate": 1.1698657122671973e-05, + "loss": 0.0005, + "num_tokens": 1410357730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5014935563710847, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014573062891045928, + "kl": 0.0125732421875, + "learning_rate": 1.1692785521375859e-05, + "loss": 0.0005, + "num_tokens": 1410918674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5016642485277801, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015820295801850903, + "kl": 0.012451171875, + "learning_rate": 1.1686913319201985e-05, + "loss": 0.0005, + "num_tokens": 1411478978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5018349406844755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001514537507931841, + "kl": 0.0122528076171875, + "learning_rate": 1.1681040518234781e-05, + "loss": 0.0005, + "num_tokens": 1412043026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.502005632841171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035375820141654254, + "kl": 0.0128173828125, + "learning_rate": 1.167516712055887e-05, + "loss": 0.0005, + "num_tokens": 1412609314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5021763249978664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006239827686251156, + "kl": 0.0125579833984375, + "learning_rate": 1.1669293128259107e-05, + "loss": 0.0005, + "num_tokens": 1413178978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5023470171545618, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001407615530529556, + "kl": 0.012542724609375, + "learning_rate": 1.166341854342054e-05, + "loss": 0.0005, + "num_tokens": 1413747842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5025177093112572, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003427338472186134, + "kl": 0.0126190185546875, + "learning_rate": 1.1657543368128434e-05, + "loss": 0.0005, + "num_tokens": 1414309186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5026884014679526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013297330187070022, + "kl": 0.0124359130859375, + "learning_rate": 1.1651667604468268e-05, + "loss": 0.0005, + "num_tokens": 1414875074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.502859093624648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002663872612962996, + "kl": 0.0128021240234375, + "learning_rate": 1.1645791254525723e-05, + "loss": 0.0005, + "num_tokens": 1415436738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5030297857813434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007575304643025174, + "kl": 0.0123138427734375, + "learning_rate": 1.1639914320386694e-05, + "loss": 0.0005, + "num_tokens": 1416008370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5032004779380388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002743139957516762, + "kl": 0.012359619140625, + "learning_rate": 1.1634036804137279e-05, + "loss": 0.0005, + "num_tokens": 1416575730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5033711700947342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004941583472789338, + "kl": 0.013031005859375, + "learning_rate": 1.162815870786378e-05, + "loss": 0.0005, + "num_tokens": 1417148370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5035418622514295, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.757469964864917e-05, + "kl": 0.0121307373046875, + "learning_rate": 1.1622280033652718e-05, + "loss": 0.0005, + "num_tokens": 1417718178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5037125544081249, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006139152874808787, + "kl": 0.0130462646484375, + "learning_rate": 1.1616400783590802e-05, + "loss": 0.0005, + "num_tokens": 1418283858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5038832465648203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003579317381979768, + "kl": 0.0128936767578125, + "learning_rate": 1.1610520959764957e-05, + "loss": 0.0005, + "num_tokens": 1418852402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5040539387215157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012026217408532105, + "kl": 0.012725830078125, + "learning_rate": 1.160464056426231e-05, + "loss": 0.0005, + "num_tokens": 1419416994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5042246308782111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038090313280240697, + "kl": 0.0130462646484375, + "learning_rate": 1.1598759599170184e-05, + "loss": 0.0005, + "num_tokens": 1419983298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5043953230349065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021224570515387945, + "kl": 0.0124969482421875, + "learning_rate": 1.1592878066576112e-05, + "loss": 0.0005, + "num_tokens": 1420549746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5045660151916019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003131075221815362, + "kl": 0.012420654296875, + "learning_rate": 1.1586995968567827e-05, + "loss": 0.0005, + "num_tokens": 1421113554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5047367073482973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029521908393234723, + "kl": 0.012847900390625, + "learning_rate": 1.158111330723326e-05, + "loss": 0.0005, + "num_tokens": 1421680674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5049073995049927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009241256066452136, + "kl": 0.01312255859375, + "learning_rate": 1.1575230084660544e-05, + "loss": 0.0005, + "num_tokens": 1422245746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5050780916616882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002629351883242302, + "kl": 0.0124664306640625, + "learning_rate": 1.1569346302938012e-05, + "loss": 0.0005, + "num_tokens": 1422811810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5052487838183836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018866764516962584, + "kl": 0.0126953125, + "learning_rate": 1.1563461964154192e-05, + "loss": 0.0005, + "num_tokens": 1423378210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.505419475975079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007468559597983247, + "kl": 0.0123291015625, + "learning_rate": 1.1557577070397813e-05, + "loss": 0.0005, + "num_tokens": 1423943202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5055901681317744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020383502385062376, + "kl": 0.0124664306640625, + "learning_rate": 1.1551691623757797e-05, + "loss": 0.0005, + "num_tokens": 1424506994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5057608602884698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003652202167490639, + "kl": 0.012939453125, + "learning_rate": 1.1545805626323265e-05, + "loss": 0.0005, + "num_tokens": 1425075410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5059315524451652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001657706244821933, + "kl": 0.012420654296875, + "learning_rate": 1.1539919080183544e-05, + "loss": 0.0005, + "num_tokens": 1425634930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5061022446018606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001189034822997518, + "kl": 0.01275634765625, + "learning_rate": 1.153403198742813e-05, + "loss": 0.0005, + "num_tokens": 1426204866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5062729367585559, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004196176650070573, + "kl": 0.0118255615234375, + "learning_rate": 1.152814435014674e-05, + "loss": 0.0005, + "num_tokens": 1426771282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5064436289152513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022263692083011627, + "kl": 0.01263427734375, + "learning_rate": 1.1522256170429268e-05, + "loss": 0.0005, + "num_tokens": 1427335202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5066143210719467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005277649167729892, + "kl": 0.01214599609375, + "learning_rate": 1.1516367450365804e-05, + "loss": 0.0005, + "num_tokens": 1427900098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5067850132286421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028136187393470297, + "kl": 0.0121612548828125, + "learning_rate": 1.1510478192046632e-05, + "loss": 0.0005, + "num_tokens": 1428463410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5069557053853375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039693167642425656, + "kl": 0.0128173828125, + "learning_rate": 1.1504588397562233e-05, + "loss": 0.0005, + "num_tokens": 1429033730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5071263975420329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044067199621812824, + "kl": 0.012786865234375, + "learning_rate": 1.1498698069003258e-05, + "loss": 0.0005, + "num_tokens": 1429604690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5072970896987283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008861802008390717, + "kl": 0.0127105712890625, + "learning_rate": 1.1492807208460575e-05, + "loss": 0.0005, + "num_tokens": 1430176274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5074677818554237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009254899064061403, + "kl": 0.0119781494140625, + "learning_rate": 1.1486915818025214e-05, + "loss": 0.0005, + "num_tokens": 1430740578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5076384740121191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001761251201541585, + "kl": 0.012237548828125, + "learning_rate": 1.1481023899788414e-05, + "loss": 0.0005, + "num_tokens": 1431305154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5078091661688146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024463085410570036, + "kl": 0.01239013671875, + "learning_rate": 1.1475131455841595e-05, + "loss": 0.0005, + "num_tokens": 1431871554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.50797985832551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010922227059498458, + "kl": 0.012451171875, + "learning_rate": 1.1469238488276355e-05, + "loss": 0.0005, + "num_tokens": 1432438002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5081505504822054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029358064248783024, + "kl": 0.0123748779296875, + "learning_rate": 1.1463344999184489e-05, + "loss": 0.0005, + "num_tokens": 1433002850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5083212426389008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003036098379358619, + "kl": 0.0134429931640625, + "learning_rate": 1.1457450990657973e-05, + "loss": 0.0005, + "num_tokens": 1433570626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5084919347955962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002731056156483177, + "kl": 0.012847900390625, + "learning_rate": 1.1451556464788964e-05, + "loss": 0.0005, + "num_tokens": 1434134354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5086626269522916, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.42634962227087e-05, + "kl": 0.0127105712890625, + "learning_rate": 1.1445661423669812e-05, + "loss": 0.0005, + "num_tokens": 1434700994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.508833319108987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034326015499203127, + "kl": 0.012786865234375, + "learning_rate": 1.1439765869393036e-05, + "loss": 0.0005, + "num_tokens": 1435264562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5090040112656823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011725463156192346, + "kl": 0.01214599609375, + "learning_rate": 1.1433869804051349e-05, + "loss": 0.0005, + "num_tokens": 1435827842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5091747034223777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002659536830348444, + "kl": 0.01214599609375, + "learning_rate": 1.1427973229737645e-05, + "loss": 0.0005, + "num_tokens": 1436393058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5093453955790731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011052206596496289, + "kl": 0.012054443359375, + "learning_rate": 1.142207614854499e-05, + "loss": 0.0005, + "num_tokens": 1436953506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5095160877357685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003421658318451239, + "kl": 0.01275634765625, + "learning_rate": 1.1416178562566633e-05, + "loss": 0.0005, + "num_tokens": 1437519938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5096867798924639, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.189937625189021e-05, + "kl": 0.0119171142578125, + "learning_rate": 1.1410280473896011e-05, + "loss": 0.0005, + "num_tokens": 1438082274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5098574720491593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018410231811387046, + "kl": 0.012725830078125, + "learning_rate": 1.1404381884626728e-05, + "loss": 0.0005, + "num_tokens": 1438643202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5100281642058547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003232013777366822, + "kl": 0.01251220703125, + "learning_rate": 1.1398482796852571e-05, + "loss": 0.0005, + "num_tokens": 1439203154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5101988563625501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031126418322335143, + "kl": 0.012176513671875, + "learning_rate": 1.1392583212667503e-05, + "loss": 0.0005, + "num_tokens": 1439772290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5103695485192455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041988121151587425, + "kl": 0.0129241943359375, + "learning_rate": 1.1386683134165666e-05, + "loss": 0.0005, + "num_tokens": 1440338706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.510540240675941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042605870024847107, + "kl": 0.012969970703125, + "learning_rate": 1.138078256344137e-05, + "loss": 0.0005, + "num_tokens": 1440902498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5107109328326364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045969175739927, + "kl": 0.0128021240234375, + "learning_rate": 1.1374881502589108e-05, + "loss": 0.0005, + "num_tokens": 1441461074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5108816249893318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014786329925405353, + "kl": 0.01239013671875, + "learning_rate": 1.136897995370354e-05, + "loss": 0.0005, + "num_tokens": 1442023698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5110523171460272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029294124053716155, + "kl": 0.012786865234375, + "learning_rate": 1.1363077918879512e-05, + "loss": 0.0005, + "num_tokens": 1442591986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5112230093027226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020087245956653765, + "kl": 0.011962890625, + "learning_rate": 1.1357175400212024e-05, + "loss": 0.0005, + "num_tokens": 1443159762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.511393701459418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038252390943970606, + "kl": 0.01251220703125, + "learning_rate": 1.1351272399796262e-05, + "loss": 0.0005, + "num_tokens": 1443730882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5115643936161134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005454373101963528, + "kl": 0.0129547119140625, + "learning_rate": 1.1345368919727574e-05, + "loss": 0.0005, + "num_tokens": 1444298562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5117350857728087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010599770715018484, + "kl": 0.01239013671875, + "learning_rate": 1.1339464962101486e-05, + "loss": 0.0005, + "num_tokens": 1444862914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5119057779295041, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.251050770046302e-05, + "kl": 0.01226806640625, + "learning_rate": 1.1333560529013685e-05, + "loss": 0.0005, + "num_tokens": 1445426626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 2999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5120764700861995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006328158844155325, + "kl": 0.0127410888671875, + "learning_rate": 1.1327655622560038e-05, + "loss": 0.0005, + "num_tokens": 1445993186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5122471622428949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005313953703402868, + "kl": 0.01251220703125, + "learning_rate": 1.132175024483657e-05, + "loss": 0.0005, + "num_tokens": 1446562322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5124178543995903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004017871642511911, + "kl": 0.0128631591796875, + "learning_rate": 1.131584439793948e-05, + "loss": 0.0005, + "num_tokens": 1447121234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5125885465562857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004981770798572636, + "kl": 0.0129547119140625, + "learning_rate": 1.1309938083965121e-05, + "loss": 0.0005, + "num_tokens": 1447686098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5127592387129811, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002433238038715755, + "kl": 0.012603759765625, + "learning_rate": 1.1304031305010037e-05, + "loss": 0.0005, + "num_tokens": 1448249746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5129299308696765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017002595361316478, + "kl": 0.012481689453125, + "learning_rate": 1.1298124063170906e-05, + "loss": 0.0005, + "num_tokens": 1448814482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5131006230263719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000956342809112216, + "kl": 0.0118255615234375, + "learning_rate": 1.1292216360544594e-05, + "loss": 0.0005, + "num_tokens": 1449378146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5132713151830673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021246265534706477, + "kl": 0.0125274658203125, + "learning_rate": 1.1286308199228122e-05, + "loss": 0.0005, + "num_tokens": 1449943186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5134420073397628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028217869320419945, + "kl": 0.0122528076171875, + "learning_rate": 1.1280399581318671e-05, + "loss": 0.0005, + "num_tokens": 1450507858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5136126994964582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011735724122943119, + "kl": 0.0125274658203125, + "learning_rate": 1.1274490508913588e-05, + "loss": 0.0005, + "num_tokens": 1451075026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5137833916531536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022258368119338416, + "kl": 0.012237548828125, + "learning_rate": 1.1268580984110382e-05, + "loss": 0.0005, + "num_tokens": 1451642482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.513954083809849, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001488593286105561, + "kl": 0.01226806640625, + "learning_rate": 1.1262671009006719e-05, + "loss": 0.0005, + "num_tokens": 1452207730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5141247759665444, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.954115750917416e-05, + "kl": 0.0120697021484375, + "learning_rate": 1.1256760585700428e-05, + "loss": 0.0005, + "num_tokens": 1452771858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5142954681232398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018361555850187296, + "kl": 0.0127105712890625, + "learning_rate": 1.1250849716289497e-05, + "loss": 0.0005, + "num_tokens": 1453333346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5144661602799352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015783502536245505, + "kl": 0.01239013671875, + "learning_rate": 1.1244938402872065e-05, + "loss": 0.0005, + "num_tokens": 1453894786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5146368524366305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003245704951844875, + "kl": 0.0125732421875, + "learning_rate": 1.1239026647546442e-05, + "loss": 0.0005, + "num_tokens": 1454458946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5148075445933259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012305219526355743, + "kl": 0.0121307373046875, + "learning_rate": 1.1233114452411088e-05, + "loss": 0.0005, + "num_tokens": 1455028082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5149782367500213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030757085875236453, + "kl": 0.0124664306640625, + "learning_rate": 1.1227201819564615e-05, + "loss": 0.0005, + "num_tokens": 1455594018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5151489289067167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034624863048436584, + "kl": 0.0128936767578125, + "learning_rate": 1.1221288751105794e-05, + "loss": 0.0005, + "num_tokens": 1456155298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5153196210634121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010030339563791341, + "kl": 0.0121002197265625, + "learning_rate": 1.1215375249133553e-05, + "loss": 0.0005, + "num_tokens": 1456718306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5154903132201075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017380515374080578, + "kl": 0.0124053955078125, + "learning_rate": 1.1209461315746972e-05, + "loss": 0.0005, + "num_tokens": 1457280882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5156610053768029, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022569254820911465, + "kl": 0.0128936767578125, + "learning_rate": 1.1203546953045283e-05, + "loss": 0.0005, + "num_tokens": 1457838834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5158316975334983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004320728644563459, + "kl": 0.0125885009765625, + "learning_rate": 1.119763216312787e-05, + "loss": 0.0005, + "num_tokens": 1458413234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5160023896901937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000188568787617121, + "kl": 0.01275634765625, + "learning_rate": 1.119171694809427e-05, + "loss": 0.0005, + "num_tokens": 1458976226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5161730818468891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045738985591094693, + "kl": 0.012237548828125, + "learning_rate": 1.1185801310044176e-05, + "loss": 0.0005, + "num_tokens": 1459541746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5163437740035846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015747465630114166, + "kl": 0.0123291015625, + "learning_rate": 1.1179885251077418e-05, + "loss": 0.0005, + "num_tokens": 1460111330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.51651446616028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048180960914411415, + "kl": 0.01214599609375, + "learning_rate": 1.117396877329399e-05, + "loss": 0.0005, + "num_tokens": 1460676882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5166851583169754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004852903022849927, + "kl": 0.0126953125, + "learning_rate": 1.1168051878794021e-05, + "loss": 0.0005, + "num_tokens": 1461241746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5168558504736708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008108381410496035, + "kl": 0.01275634765625, + "learning_rate": 1.1162134569677805e-05, + "loss": 0.0005, + "num_tokens": 1461802754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5170265426303662, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.954647346557698e-05, + "kl": 0.012054443359375, + "learning_rate": 1.1156216848045764e-05, + "loss": 0.0005, + "num_tokens": 1462364130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5171972347870616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000168764939082573, + "kl": 0.012237548828125, + "learning_rate": 1.1150298715998482e-05, + "loss": 0.0005, + "num_tokens": 1462931554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5173679269437569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006166251440326814, + "kl": 0.011627197265625, + "learning_rate": 1.1144380175636672e-05, + "loss": 0.0005, + "num_tokens": 1463499442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5175386191004523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017713453377238267, + "kl": 0.012939453125, + "learning_rate": 1.113846122906122e-05, + "loss": 0.0005, + "num_tokens": 1464066066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5177093112571477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030944393877555253, + "kl": 0.0127410888671875, + "learning_rate": 1.113254187837312e-05, + "loss": 0.0005, + "num_tokens": 1464628978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5178800034138431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019090849230650664, + "kl": 0.0124969482421875, + "learning_rate": 1.1126622125673538e-05, + "loss": 0.0005, + "num_tokens": 1465196930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5180506955705385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002767657497411515, + "kl": 0.012481689453125, + "learning_rate": 1.1120701973063768e-05, + "loss": 0.0005, + "num_tokens": 1465762146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5182213877272339, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009252236834161664, + "kl": 0.013031005859375, + "learning_rate": 1.1114781422645255e-05, + "loss": 0.0005, + "num_tokens": 1466329058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5183920798839293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041852160164756097, + "kl": 0.0125885009765625, + "learning_rate": 1.110886047651958e-05, + "loss": 0.0005, + "num_tokens": 1466892930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5185627720406247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041697594205258984, + "kl": 0.0126190185546875, + "learning_rate": 1.110293913678846e-05, + "loss": 0.0005, + "num_tokens": 1467465874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5187334641973201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008933833074614509, + "kl": 0.012451171875, + "learning_rate": 1.109701740555376e-05, + "loss": 0.0005, + "num_tokens": 1468032642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5189041563540155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005020557072934966, + "kl": 0.0124359130859375, + "learning_rate": 1.1091095284917483e-05, + "loss": 0.0005, + "num_tokens": 1468597906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.519074848510711, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015133990319372533, + "kl": 0.012725830078125, + "learning_rate": 1.1085172776981766e-05, + "loss": 0.0005, + "num_tokens": 1469161826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5192455406674064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004275145560523786, + "kl": 0.012908935546875, + "learning_rate": 1.1079249883848887e-05, + "loss": 0.0005, + "num_tokens": 1469729090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5194162328241018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015543589166239512, + "kl": 0.0126495361328125, + "learning_rate": 1.1073326607621255e-05, + "loss": 0.0005, + "num_tokens": 1470292178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5195869249807972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038674028623510495, + "kl": 0.012786865234375, + "learning_rate": 1.1067402950401422e-05, + "loss": 0.0005, + "num_tokens": 1470854914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5197576171374926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018335838191245004, + "kl": 0.0127716064453125, + "learning_rate": 1.1061478914292075e-05, + "loss": 0.0005, + "num_tokens": 1471426130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.519928309294188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006106076463934617, + "kl": 0.012969970703125, + "learning_rate": 1.1055554501396029e-05, + "loss": 0.0005, + "num_tokens": 1471987906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5200990014508833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040096570138102274, + "kl": 0.012481689453125, + "learning_rate": 1.104962971381624e-05, + "loss": 0.0005, + "num_tokens": 1472554482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5202696936075787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001320934519666805, + "kl": 0.0127410888671875, + "learning_rate": 1.104370455365579e-05, + "loss": 0.0005, + "num_tokens": 1473112098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5204403857642741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003964477284526694, + "kl": 0.0125579833984375, + "learning_rate": 1.10377790230179e-05, + "loss": 0.0005, + "num_tokens": 1473679010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5206110779209695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016549489162869438, + "kl": 0.012359619140625, + "learning_rate": 1.103185312400592e-05, + "loss": 0.0005, + "num_tokens": 1474245042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5207817700776649, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005196636956385463, + "kl": 0.012237548828125, + "learning_rate": 1.1025926858723327e-05, + "loss": 0.0005, + "num_tokens": 1474808274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5209524622343603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005261506441134112, + "kl": 0.0124969482421875, + "learning_rate": 1.1020000229273732e-05, + "loss": 0.0005, + "num_tokens": 1475383602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5211231543910557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001834810822466722, + "kl": 0.0123443603515625, + "learning_rate": 1.1014073237760879e-05, + "loss": 0.0005, + "num_tokens": 1475947426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5212938465477511, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048013737806464486, + "kl": 0.01226806640625, + "learning_rate": 1.1008145886288629e-05, + "loss": 0.0005, + "num_tokens": 1476512434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5214645387044465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001760304286663538, + "kl": 0.0126800537109375, + "learning_rate": 1.1002218176960982e-05, + "loss": 0.0005, + "num_tokens": 1477076818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5216352308611419, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026762815710632844, + "kl": 0.012451171875, + "learning_rate": 1.0996290111882062e-05, + "loss": 0.0005, + "num_tokens": 1477639234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5218059230178373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007191936044049278, + "kl": 0.0128173828125, + "learning_rate": 1.0990361693156115e-05, + "loss": 0.0005, + "num_tokens": 1478203938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5219766151745328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025914620782678305, + "kl": 0.013031005859375, + "learning_rate": 1.0984432922887516e-05, + "loss": 0.0005, + "num_tokens": 1478775810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5221473073312282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013353563499955687, + "kl": 0.01226806640625, + "learning_rate": 1.0978503803180767e-05, + "loss": 0.0005, + "num_tokens": 1479348850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5223179994879236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004164666522283071, + "kl": 0.0133819580078125, + "learning_rate": 1.097257433614049e-05, + "loss": 0.0005, + "num_tokens": 1479910818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.522488691644619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012737981325319458, + "kl": 0.012725830078125, + "learning_rate": 1.0966644523871431e-05, + "loss": 0.0005, + "num_tokens": 1480471330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5226593838013144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000347268786552226, + "kl": 0.0131072998046875, + "learning_rate": 1.0960714368478458e-05, + "loss": 0.0005, + "num_tokens": 1481038306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5228300759580097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005956100475063415, + "kl": 0.012969970703125, + "learning_rate": 1.0954783872066566e-05, + "loss": 0.0005, + "num_tokens": 1481600306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5230007681147051, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004554488568901333, + "kl": 0.0126800537109375, + "learning_rate": 1.0948853036740865e-05, + "loss": 0.0005, + "num_tokens": 1482170066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5231714602714005, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042667394105898906, + "kl": 0.0125732421875, + "learning_rate": 1.0942921864606588e-05, + "loss": 0.0005, + "num_tokens": 1482736722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5233421524280959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014039051029738482, + "kl": 0.0125274658203125, + "learning_rate": 1.0936990357769083e-05, + "loss": 0.0005, + "num_tokens": 1483298834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5235128445847913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003103704639465247, + "kl": 0.0119171142578125, + "learning_rate": 1.0931058518333831e-05, + "loss": 0.0005, + "num_tokens": 1483872114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5236835367414867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01507749610595934, + "kl": 0.014007568359375, + "learning_rate": 1.0925126348406407e-05, + "loss": 0.0006, + "num_tokens": 1484438226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5238542288981821, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029026153406431555, + "kl": 0.012786865234375, + "learning_rate": 1.0919193850092531e-05, + "loss": 0.0005, + "num_tokens": 1485018274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5240249210548775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000248067799008542, + "kl": 0.012969970703125, + "learning_rate": 1.0913261025498014e-05, + "loss": 0.0005, + "num_tokens": 1485585954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5241956132115729, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003823050385808283, + "kl": 0.0126190185546875, + "learning_rate": 1.0907327876728805e-05, + "loss": 0.0005, + "num_tokens": 1486150066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5243663053682683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003538905816904496, + "kl": 0.012725830078125, + "learning_rate": 1.0901394405890947e-05, + "loss": 0.0005, + "num_tokens": 1486718738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5245369975249637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012805417538631184, + "kl": 0.0128021240234375, + "learning_rate": 1.0895460615090618e-05, + "loss": 0.0005, + "num_tokens": 1487284514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5247076896816592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006902287350730303, + "kl": 0.01275634765625, + "learning_rate": 1.0889526506434092e-05, + "loss": 0.0005, + "num_tokens": 1487852386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5248783818383546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002843727551235963, + "kl": 0.0124053955078125, + "learning_rate": 1.0883592082027767e-05, + "loss": 0.0005, + "num_tokens": 1488415330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.52504907399505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038826882505892015, + "kl": 0.0126800537109375, + "learning_rate": 1.0877657343978147e-05, + "loss": 0.0005, + "num_tokens": 1488982098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5252197661517454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001423160928717013, + "kl": 0.012664794921875, + "learning_rate": 1.0871722294391854e-05, + "loss": 0.0005, + "num_tokens": 1489545298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5253904583084408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020490437947896304, + "kl": 0.0124359130859375, + "learning_rate": 1.0865786935375612e-05, + "loss": 0.0005, + "num_tokens": 1490112418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5255611504651361, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012393572735236648, + "kl": 0.012542724609375, + "learning_rate": 1.0859851269036263e-05, + "loss": 0.0005, + "num_tokens": 1490681730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5257318426218315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002752572163377476, + "kl": 0.0126190185546875, + "learning_rate": 1.0853915297480753e-05, + "loss": 0.0005, + "num_tokens": 1491244274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5259025347785269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045030018184743283, + "kl": 0.01300048828125, + "learning_rate": 1.0847979022816135e-05, + "loss": 0.0005, + "num_tokens": 1491807026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5260732269352223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002876864785234226, + "kl": 0.012420654296875, + "learning_rate": 1.0842042447149574e-05, + "loss": 0.0005, + "num_tokens": 1492371426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5262439190919177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008434737550397671, + "kl": 0.012603759765625, + "learning_rate": 1.0836105572588343e-05, + "loss": 0.0005, + "num_tokens": 1492933154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5264146112486131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008228040863944951, + "kl": 0.0125579833984375, + "learning_rate": 1.0830168401239811e-05, + "loss": 0.0005, + "num_tokens": 1493499746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5265853034053085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010030169985720588, + "kl": 0.01275634765625, + "learning_rate": 1.0824230935211468e-05, + "loss": 0.0005, + "num_tokens": 1494068834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5267559955620039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0225856172121138, + "kl": 0.023681640625, + "learning_rate": 1.0818293176610895e-05, + "loss": 0.0009, + "num_tokens": 1494665842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5269266877186993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003151691429877399, + "kl": 0.0130767822265625, + "learning_rate": 1.0812355127545781e-05, + "loss": 0.0005, + "num_tokens": 1495233890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5270973798753947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005304352986574955, + "kl": 0.0126953125, + "learning_rate": 1.0806416790123921e-05, + "loss": 0.0005, + "num_tokens": 1495798450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5272680720320901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007202051393508279, + "kl": 0.0125732421875, + "learning_rate": 1.0800478166453215e-05, + "loss": 0.0005, + "num_tokens": 1496369938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5274387641887855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036171682437712894, + "kl": 0.01275634765625, + "learning_rate": 1.0794539258641649e-05, + "loss": 0.0005, + "num_tokens": 1496937394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.527609456345481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003931642249885672, + "kl": 0.0128631591796875, + "learning_rate": 1.0788600068797333e-05, + "loss": 0.0005, + "num_tokens": 1497503698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5277801485021764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004935759566655801, + "kl": 0.0132293701171875, + "learning_rate": 1.0782660599028456e-05, + "loss": 0.0005, + "num_tokens": 1498067890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5279508406588718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003740341888907003, + "kl": 0.012420654296875, + "learning_rate": 1.077672085144332e-05, + "loss": 0.0005, + "num_tokens": 1498638386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5281215328155672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006393261460915689, + "kl": 0.0129547119140625, + "learning_rate": 1.077078082815032e-05, + "loss": 0.0005, + "num_tokens": 1499202498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5282922249722625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005176676591853805, + "kl": 0.0120697021484375, + "learning_rate": 1.076484053125795e-05, + "loss": 0.0005, + "num_tokens": 1499775154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5284629171289579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006957408097483948, + "kl": 0.0125579833984375, + "learning_rate": 1.0758899962874798e-05, + "loss": 0.0005, + "num_tokens": 1500339250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5286336092856533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004848644761111511, + "kl": 0.012725830078125, + "learning_rate": 1.0752959125109556e-05, + "loss": 0.0005, + "num_tokens": 1500904674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5288043014423487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008118099641921208, + "kl": 0.0130462646484375, + "learning_rate": 1.0747018020071005e-05, + "loss": 0.0005, + "num_tokens": 1501470978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5289749935990441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001012722273492248, + "kl": 0.0130157470703125, + "learning_rate": 1.0741076649868023e-05, + "loss": 0.0005, + "num_tokens": 1502039906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5291456857557395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005886386638349631, + "kl": 0.012451171875, + "learning_rate": 1.073513501660958e-05, + "loss": 0.0005, + "num_tokens": 1502609474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5293163779124349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028480857237880673, + "kl": 0.012664794921875, + "learning_rate": 1.0729193122404746e-05, + "loss": 0.0005, + "num_tokens": 1503203010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5294870700691303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004446679876860735, + "kl": 0.0125732421875, + "learning_rate": 1.0723250969362671e-05, + "loss": 0.0005, + "num_tokens": 1503770370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5296577622258257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006366926213039087, + "kl": 0.0131378173828125, + "learning_rate": 1.0717308559592615e-05, + "loss": 0.0005, + "num_tokens": 1504333298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5298284543825211, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016055344664648688, + "kl": 0.012542724609375, + "learning_rate": 1.0711365895203909e-05, + "loss": 0.0005, + "num_tokens": 1504894354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5299991465392165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003825379459850091, + "kl": 0.012908935546875, + "learning_rate": 1.0705422978305993e-05, + "loss": 0.0005, + "num_tokens": 1505465474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5301698386959119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000589891030817548, + "kl": 0.0125732421875, + "learning_rate": 1.069947981100838e-05, + "loss": 0.0005, + "num_tokens": 1506032322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5303405308526074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003838054048763486, + "kl": 0.0129241943359375, + "learning_rate": 1.069353639542069e-05, + "loss": 0.0005, + "num_tokens": 1506600066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5305112230093028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014803085769555285, + "kl": 0.0125732421875, + "learning_rate": 1.0687592733652607e-05, + "loss": 0.0005, + "num_tokens": 1507167634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5306819151659982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006737389231461434, + "kl": 0.012359619140625, + "learning_rate": 1.0681648827813928e-05, + "loss": 0.0005, + "num_tokens": 1507729090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5308526073226936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000657231022227408, + "kl": 0.0124053955078125, + "learning_rate": 1.0675704680014523e-05, + "loss": 0.0005, + "num_tokens": 1508298274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.531023299479389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001881270491883224, + "kl": 0.0123748779296875, + "learning_rate": 1.0669760292364346e-05, + "loss": 0.0005, + "num_tokens": 1508860930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5311939916360843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019336552365794993, + "kl": 0.012298583984375, + "learning_rate": 1.0663815666973443e-05, + "loss": 0.0005, + "num_tokens": 1509425618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5313646837927797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005170082723556298, + "kl": 0.0128021240234375, + "learning_rate": 1.065787080595194e-05, + "loss": 0.0005, + "num_tokens": 1509992994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5315353759494751, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002839459095897499, + "kl": 0.012054443359375, + "learning_rate": 1.0651925711410049e-05, + "loss": 0.0005, + "num_tokens": 1510560322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5317060681061705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022936755934898332, + "kl": 0.0122833251953125, + "learning_rate": 1.0645980385458063e-05, + "loss": 0.0005, + "num_tokens": 1511128578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5318767602628659, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010362691796601212, + "kl": 0.0131988525390625, + "learning_rate": 1.0640034830206361e-05, + "loss": 0.0005, + "num_tokens": 1511694994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5320474524195613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011872541828172361, + "kl": 0.01251220703125, + "learning_rate": 1.0634089047765394e-05, + "loss": 0.0005, + "num_tokens": 1512256850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5322181445762567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003661826255185084, + "kl": 0.0124664306640625, + "learning_rate": 1.0628143040245708e-05, + "loss": 0.0005, + "num_tokens": 1512819074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5323888367329521, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039249974179968183, + "kl": 0.01251220703125, + "learning_rate": 1.0622196809757916e-05, + "loss": 0.0005, + "num_tokens": 1513388082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5325595288896475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001997897731847983, + "kl": 0.0120849609375, + "learning_rate": 1.0616250358412717e-05, + "loss": 0.0005, + "num_tokens": 1513956082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5327302210463429, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001132861471695408, + "kl": 0.0120849609375, + "learning_rate": 1.0610303688320886e-05, + "loss": 0.0005, + "num_tokens": 1514524530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5329009132030383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018969262632070893, + "kl": 0.0122528076171875, + "learning_rate": 1.0604356801593276e-05, + "loss": 0.0005, + "num_tokens": 1515099202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5330716053597337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003829265425126708, + "kl": 0.0127105712890625, + "learning_rate": 1.059840970034082e-05, + "loss": 0.0005, + "num_tokens": 1515663522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5332422975164292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004179778215007826, + "kl": 0.0131988525390625, + "learning_rate": 1.0592462386674518e-05, + "loss": 0.0005, + "num_tokens": 1516226194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5334129896731246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029900236536953783, + "kl": 0.0126800537109375, + "learning_rate": 1.0586514862705457e-05, + "loss": 0.0005, + "num_tokens": 1516792530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.53358368182982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016044740752930135, + "kl": 0.0127105712890625, + "learning_rate": 1.0580567130544792e-05, + "loss": 0.0005, + "num_tokens": 1517355842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5337543739865154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029887607544738935, + "kl": 0.012786865234375, + "learning_rate": 1.0574619192303752e-05, + "loss": 0.0005, + "num_tokens": 1517923778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5339250661432107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002666646466768017, + "kl": 0.0117950439453125, + "learning_rate": 1.056867105009364e-05, + "loss": 0.0005, + "num_tokens": 1518493826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5340957582999061, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024367882386452122, + "kl": 0.012542724609375, + "learning_rate": 1.0562722706025836e-05, + "loss": 0.0005, + "num_tokens": 1519064946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5342664504566015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028239248727098454, + "kl": 0.0128326416015625, + "learning_rate": 1.0556774162211782e-05, + "loss": 0.0005, + "num_tokens": 1519628690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5344371426132969, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.179834468819477e-05, + "kl": 0.0130462646484375, + "learning_rate": 1.0550825420763002e-05, + "loss": 0.0005, + "num_tokens": 1520194722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5346078347699923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027678402393238084, + "kl": 0.012969970703125, + "learning_rate": 1.0544876483791078e-05, + "loss": 0.0005, + "num_tokens": 1520759698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5347785269266877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020533955354579188, + "kl": 0.0118560791015625, + "learning_rate": 1.0538927353407672e-05, + "loss": 0.0005, + "num_tokens": 1521334402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5349492190833831, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006083507830264287, + "kl": 0.012725830078125, + "learning_rate": 1.053297803172451e-05, + "loss": 0.0005, + "num_tokens": 1521897810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5351199112400785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035410875569794775, + "kl": 0.012786865234375, + "learning_rate": 1.0527028520853388e-05, + "loss": 0.0005, + "num_tokens": 1522456322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5352906033967739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028485311876697244, + "kl": 0.0126953125, + "learning_rate": 1.0521078822906164e-05, + "loss": 0.0005, + "num_tokens": 1523027010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5354612955534693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002447878519679259, + "kl": 0.012298583984375, + "learning_rate": 1.051512893999477e-05, + "loss": 0.0005, + "num_tokens": 1523587970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5356319877101647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005177398143381088, + "kl": 0.013153076171875, + "learning_rate": 1.0509178874231191e-05, + "loss": 0.0005, + "num_tokens": 1524150466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5358026798668601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001362363154028091, + "kl": 0.012481689453125, + "learning_rate": 1.0503228627727497e-05, + "loss": 0.0005, + "num_tokens": 1524713986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5359733720235555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038190647991394664, + "kl": 0.012908935546875, + "learning_rate": 1.0497278202595804e-05, + "loss": 0.0005, + "num_tokens": 1525281426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.536144064180251, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003224218879263348, + "kl": 0.0123138427734375, + "learning_rate": 1.0491327600948302e-05, + "loss": 0.0005, + "num_tokens": 1525852754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5363147563369464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015724251688602692, + "kl": 0.01287841796875, + "learning_rate": 1.0485376824897236e-05, + "loss": 0.0005, + "num_tokens": 1526414562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5364854484936418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021105997991630206, + "kl": 0.01275634765625, + "learning_rate": 1.0479425876554919e-05, + "loss": 0.0005, + "num_tokens": 1526981954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5366561406503371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029827876421789276, + "kl": 0.0124969482421875, + "learning_rate": 1.0473474758033721e-05, + "loss": 0.0005, + "num_tokens": 1527544962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5368268328070325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008931873278175347, + "kl": 0.0126495361328125, + "learning_rate": 1.0467523471446077e-05, + "loss": 0.0005, + "num_tokens": 1528106834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5369975249637279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003450418949099396, + "kl": 0.0121612548828125, + "learning_rate": 1.0461572018904476e-05, + "loss": 0.0005, + "num_tokens": 1528669106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5371682171204233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018746057214010807, + "kl": 0.0126800537109375, + "learning_rate": 1.045562040252147e-05, + "loss": 0.0005, + "num_tokens": 1529235154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5373389092771187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003517103415692858, + "kl": 0.012176513671875, + "learning_rate": 1.0449668624409668e-05, + "loss": 0.0005, + "num_tokens": 1529800914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5375096014338141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005592623659777259, + "kl": 0.0124969482421875, + "learning_rate": 1.0443716686681738e-05, + "loss": 0.0005, + "num_tokens": 1530365346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5376802935905095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004889469157843845, + "kl": 0.012237548828125, + "learning_rate": 1.0437764591450398e-05, + "loss": 0.0005, + "num_tokens": 1530930098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5378509857472049, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028083585160454703, + "kl": 0.01336669921875, + "learning_rate": 1.0431812340828433e-05, + "loss": 0.0005, + "num_tokens": 1531495570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5380216779039003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000285188454004457, + "kl": 0.011932373046875, + "learning_rate": 1.0425859936928675e-05, + "loss": 0.0005, + "num_tokens": 1532068098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5381923700605957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001287370236879498, + "kl": 0.012603759765625, + "learning_rate": 1.0419907381864012e-05, + "loss": 0.0005, + "num_tokens": 1532628098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5383630622172911, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018994414458624537, + "kl": 0.0125274658203125, + "learning_rate": 1.0413954677747385e-05, + "loss": 0.0005, + "num_tokens": 1533192434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5385337543739865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040336172503181386, + "kl": 0.01239013671875, + "learning_rate": 1.0408001826691792e-05, + "loss": 0.0005, + "num_tokens": 1533759218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.538704446530682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038043020233221653, + "kl": 0.0124053955078125, + "learning_rate": 1.0402048830810277e-05, + "loss": 0.0005, + "num_tokens": 1534323954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5388751386873774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002945203144365393, + "kl": 0.0121612548828125, + "learning_rate": 1.039609569221594e-05, + "loss": 0.0005, + "num_tokens": 1534891378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5390458308440728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003133614167942367, + "kl": 0.012298583984375, + "learning_rate": 1.0390142413021929e-05, + "loss": 0.0005, + "num_tokens": 1535458882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5392165230007682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001974605690091995, + "kl": 0.0131378173828125, + "learning_rate": 1.0384188995341449e-05, + "loss": 0.0005, + "num_tokens": 1536033954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5393872151574635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030735042159232243, + "kl": 0.01202392578125, + "learning_rate": 1.037823544128774e-05, + "loss": 0.0005, + "num_tokens": 1536601314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5395579073141589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012945172736068328, + "kl": 0.012451171875, + "learning_rate": 1.0372281752974108e-05, + "loss": 0.0005, + "num_tokens": 1537163506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5397285994708543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033904535072323523, + "kl": 0.0120391845703125, + "learning_rate": 1.0366327932513886e-05, + "loss": 0.0005, + "num_tokens": 1537741298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5398992916275497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020650396177793615, + "kl": 0.0125579833984375, + "learning_rate": 1.0360373982020478e-05, + "loss": 0.0005, + "num_tokens": 1538308546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5400699837842451, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.302996406490146, + "kl": 0.22607421875, + "learning_rate": 1.0354419903607308e-05, + "loss": 0.009, + "num_tokens": 1538903730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5402406759409405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005010126584726345, + "kl": 0.0134735107421875, + "learning_rate": 1.0348465699387873e-05, + "loss": 0.0005, + "num_tokens": 1539466130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5404113680976359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014103751392710064, + "kl": 0.0130462646484375, + "learning_rate": 1.0342511371475687e-05, + "loss": 0.0005, + "num_tokens": 1540024194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5405820602543313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025092151295491127, + "kl": 0.013031005859375, + "learning_rate": 1.0336556921984333e-05, + "loss": 0.0005, + "num_tokens": 1540588290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5407527524110267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000973603954652138, + "kl": 0.012786865234375, + "learning_rate": 1.033060235302742e-05, + "loss": 0.0005, + "num_tokens": 1541153282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5409234445677221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004919754194865141, + "kl": 0.01312255859375, + "learning_rate": 1.0324647666718604e-05, + "loss": 0.0005, + "num_tokens": 1541720482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5410941367244175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001244958328908337, + "kl": 0.012054443359375, + "learning_rate": 1.0318692865171585e-05, + "loss": 0.0005, + "num_tokens": 1542294626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5412648288811129, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007008624825524395, + "kl": 0.0123748779296875, + "learning_rate": 1.0312737950500105e-05, + "loss": 0.0005, + "num_tokens": 1542862274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5414355210378083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007486747432479395, + "kl": 0.0122833251953125, + "learning_rate": 1.0306782924817941e-05, + "loss": 0.0005, + "num_tokens": 1543428018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5416062131945037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001915801703859636, + "kl": 0.01263427734375, + "learning_rate": 1.0300827790238914e-05, + "loss": 0.0005, + "num_tokens": 1543996530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5417769053511992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008965116240130671, + "kl": 0.0137786865234375, + "learning_rate": 1.029487254887688e-05, + "loss": 0.0006, + "num_tokens": 1544569250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5419475975078946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002900381577852313, + "kl": 0.013092041015625, + "learning_rate": 1.0288917202845735e-05, + "loss": 0.0005, + "num_tokens": 1545142354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5421182896645899, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007778583274568512, + "kl": 0.0132904052734375, + "learning_rate": 1.0282961754259414e-05, + "loss": 0.0005, + "num_tokens": 1545703058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5422889818212853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008775207157225127, + "kl": 0.0123443603515625, + "learning_rate": 1.0277006205231882e-05, + "loss": 0.0005, + "num_tokens": 1546265922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5424596739779807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007905040570774601, + "kl": 0.0129241943359375, + "learning_rate": 1.0271050557877147e-05, + "loss": 0.0005, + "num_tokens": 1546832706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5426303661346761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000633702085030384, + "kl": 0.0127716064453125, + "learning_rate": 1.0265094814309249e-05, + "loss": 0.0005, + "num_tokens": 1547397410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5428010582913715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002478858307166785, + "kl": 0.0127105712890625, + "learning_rate": 1.025913897664226e-05, + "loss": 0.0005, + "num_tokens": 1547962194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5429717504480669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010108829489210676, + "kl": 0.01287841796875, + "learning_rate": 1.0253183046990292e-05, + "loss": 0.0005, + "num_tokens": 1548533602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5431424426047623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027039793271399623, + "kl": 0.013427734375, + "learning_rate": 1.0247227027467479e-05, + "loss": 0.0005, + "num_tokens": 1549093938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5433131347614577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014170535773221996, + "kl": 0.0124053955078125, + "learning_rate": 1.0241270920187996e-05, + "loss": 0.0005, + "num_tokens": 1549662194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5434838269181531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01213989775575043, + "kl": 0.0144805908203125, + "learning_rate": 1.0235314727266046e-05, + "loss": 0.0006, + "num_tokens": 1550231362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5436545190748485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001170785116949109, + "kl": 0.0131072998046875, + "learning_rate": 1.0229358450815864e-05, + "loss": 0.0005, + "num_tokens": 1550798098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5438252112315439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008723631537662573, + "kl": 0.0129547119140625, + "learning_rate": 1.0223402092951707e-05, + "loss": 0.0005, + "num_tokens": 1551362770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5439959033882393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009573003069249841, + "kl": 0.01336669921875, + "learning_rate": 1.0217445655787874e-05, + "loss": 0.0005, + "num_tokens": 1551929570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5441665955449347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011374894469499334, + "kl": 0.01275634765625, + "learning_rate": 1.021148914143868e-05, + "loss": 0.0005, + "num_tokens": 1552497538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5443372877016301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009332093318925368, + "kl": 0.0128936767578125, + "learning_rate": 1.020553255201848e-05, + "loss": 0.0005, + "num_tokens": 1553065906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5445079798583256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006274134854895833, + "kl": 0.0136871337890625, + "learning_rate": 1.0199575889641638e-05, + "loss": 0.0005, + "num_tokens": 1553632146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.544678672015021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006378156039645742, + "kl": 0.013916015625, + "learning_rate": 1.0193619156422566e-05, + "loss": 0.0006, + "num_tokens": 1554202466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5448493641717163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009315420405949084, + "kl": 0.0132293701171875, + "learning_rate": 1.0187662354475675e-05, + "loss": 0.0005, + "num_tokens": 1554772306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5450200563284117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027420616327145965, + "kl": 0.0136566162109375, + "learning_rate": 1.0181705485915429e-05, + "loss": 0.0005, + "num_tokens": 1555334706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5451907484851071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001302190097410184, + "kl": 0.0129852294921875, + "learning_rate": 1.017574855285629e-05, + "loss": 0.0005, + "num_tokens": 1555904274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5453614406418025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008163298989136855, + "kl": 0.0132293701171875, + "learning_rate": 1.0169791557412763e-05, + "loss": 0.0005, + "num_tokens": 1556474898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5455321327984979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017338665694613426, + "kl": 0.0127716064453125, + "learning_rate": 1.0163834501699358e-05, + "loss": 0.0005, + "num_tokens": 1557040418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5457028249551933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010987881454091685, + "kl": 0.0131988525390625, + "learning_rate": 1.0157877387830623e-05, + "loss": 0.0005, + "num_tokens": 1557602418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5458735171118887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010762203721415048, + "kl": 0.0133056640625, + "learning_rate": 1.0151920217921111e-05, + "loss": 0.0005, + "num_tokens": 1558170530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5460442092685841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023308920986509936, + "kl": 0.0127410888671875, + "learning_rate": 1.0145962994085407e-05, + "loss": 0.0005, + "num_tokens": 1558732018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5462149014252795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008527180215467562, + "kl": 0.0126953125, + "learning_rate": 1.014000571843811e-05, + "loss": 0.0005, + "num_tokens": 1559295746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5463855935819749, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006572267440492243, + "kl": 0.01318359375, + "learning_rate": 1.0134048393093836e-05, + "loss": 0.0005, + "num_tokens": 1559859858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5465562857386703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005011450924931122, + "kl": 0.0133514404296875, + "learning_rate": 1.0128091020167222e-05, + "loss": 0.0005, + "num_tokens": 1560427090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5467269778953657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006120566323284789, + "kl": 0.012725830078125, + "learning_rate": 1.0122133601772919e-05, + "loss": 0.0005, + "num_tokens": 1560996098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5468976700520611, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010355444447644074, + "kl": 0.0137939453125, + "learning_rate": 1.0116176140025596e-05, + "loss": 0.0006, + "num_tokens": 1561556386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5470683622087565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002763767658222766, + "kl": 0.0135345458984375, + "learning_rate": 1.0110218637039937e-05, + "loss": 0.0005, + "num_tokens": 1562117618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.547239054365452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005735580052352193, + "kl": 0.0130462646484375, + "learning_rate": 1.010426109493064e-05, + "loss": 0.0005, + "num_tokens": 1562689362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5474097465221474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007378862967029726, + "kl": 0.0133209228515625, + "learning_rate": 1.009830351581242e-05, + "loss": 0.0005, + "num_tokens": 1563257090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5475804386788428, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007540672142269301, + "kl": 0.013214111328125, + "learning_rate": 1.0092345901799998e-05, + "loss": 0.0005, + "num_tokens": 1563826082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5477511308355381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003988137737692272, + "kl": 0.0128936767578125, + "learning_rate": 1.0086388255008115e-05, + "loss": 0.0005, + "num_tokens": 1564390018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5479218229922335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005267606828788758, + "kl": 0.013092041015625, + "learning_rate": 1.008043057755152e-05, + "loss": 0.0005, + "num_tokens": 1564952130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5480925151489289, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011194382570569078, + "kl": 0.012786865234375, + "learning_rate": 1.0074472871544973e-05, + "loss": 0.0005, + "num_tokens": 1565516306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5482632073056243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005798370398993218, + "kl": 0.0125579833984375, + "learning_rate": 1.0068515139103248e-05, + "loss": 0.0005, + "num_tokens": 1566077010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5484338994623197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004446435948687176, + "kl": 0.0136260986328125, + "learning_rate": 1.0062557382341118e-05, + "loss": 0.0005, + "num_tokens": 1566640242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5486045916190151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039102239802500075, + "kl": 0.0127716064453125, + "learning_rate": 1.0056599603373378e-05, + "loss": 0.0005, + "num_tokens": 1567201778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5487752837757105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006035643650008798, + "kl": 0.01324462890625, + "learning_rate": 1.005064180431482e-05, + "loss": 0.0005, + "num_tokens": 1567765154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5489459759324059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007118850768456797, + "kl": 0.0126190185546875, + "learning_rate": 1.004468398728025e-05, + "loss": 0.0005, + "num_tokens": 1568328786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5491166680891013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006506504664966035, + "kl": 0.012664794921875, + "learning_rate": 1.003872615438448e-05, + "loss": 0.0005, + "num_tokens": 1568903762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5492873602457967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006575164069833928, + "kl": 0.012847900390625, + "learning_rate": 1.0032768307742317e-05, + "loss": 0.0005, + "num_tokens": 1569466898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5494580524024921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000414227152753616, + "kl": 0.0127410888671875, + "learning_rate": 1.0026810449468595e-05, + "loss": 0.0005, + "num_tokens": 1570030530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5496287445591875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000728483845265656, + "kl": 0.013275146484375, + "learning_rate": 1.0020852581678126e-05, + "loss": 0.0005, + "num_tokens": 1570595842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5497994367158829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006164718457962555, + "kl": 0.013427734375, + "learning_rate": 1.0014894706485748e-05, + "loss": 0.0005, + "num_tokens": 1571160082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5499701288725783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008517398118691009, + "kl": 0.012451171875, + "learning_rate": 1.0008936826006281e-05, + "loss": 0.0005, + "num_tokens": 1571726450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5501408210292738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005657381963700539, + "kl": 0.013671875, + "learning_rate": 1.000297894235457e-05, + "loss": 0.0005, + "num_tokens": 1572289842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5503115131859692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026517044129434586, + "kl": 0.0131072998046875, + "learning_rate": 9.997021057645434e-06, + "loss": 0.0005, + "num_tokens": 1572854162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5504822053426645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007255115877612272, + "kl": 0.0128326416015625, + "learning_rate": 9.99106317399372e-06, + "loss": 0.0005, + "num_tokens": 1573416946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5506528974993599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004788568887383212, + "kl": 0.013397216796875, + "learning_rate": 9.985105293514257e-06, + "loss": 0.0005, + "num_tokens": 1573983026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5508235896560553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013976415478750507, + "kl": 0.0128936767578125, + "learning_rate": 9.979147418321877e-06, + "loss": 0.0005, + "num_tokens": 1574549314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5509942818127507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003280288797256998, + "kl": 0.0128631591796875, + "learning_rate": 9.973189550531408e-06, + "loss": 0.0005, + "num_tokens": 1575113634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5511649739694461, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006601852849560737, + "kl": 0.0127410888671875, + "learning_rate": 9.967231692257683e-06, + "loss": 0.0005, + "num_tokens": 1575674850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5513356661261415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011545000009206522, + "kl": 0.01318359375, + "learning_rate": 9.961273845615526e-06, + "loss": 0.0005, + "num_tokens": 1576242258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5515063582828369, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009425177992460966, + "kl": 0.0127716064453125, + "learning_rate": 9.955316012719753e-06, + "loss": 0.0005, + "num_tokens": 1576803954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5516770504395323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033771213391832933, + "kl": 0.0126190185546875, + "learning_rate": 9.949358195685182e-06, + "loss": 0.0005, + "num_tokens": 1577370994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5518477425962277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040683382270590156, + "kl": 0.012420654296875, + "learning_rate": 9.943400396626625e-06, + "loss": 0.0005, + "num_tokens": 1577938354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5520184347529231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031301610058879966, + "kl": 0.0131683349609375, + "learning_rate": 9.937442617658887e-06, + "loss": 0.0005, + "num_tokens": 1578504690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5521891269096185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025190788774257194, + "kl": 0.0137939453125, + "learning_rate": 9.931484860896755e-06, + "loss": 0.0006, + "num_tokens": 1579073186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5523598190663139, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008395882529006649, + "kl": 0.0127716064453125, + "learning_rate": 9.925527128455029e-06, + "loss": 0.0005, + "num_tokens": 1579639858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5525305112230093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008402570254385007, + "kl": 0.0128631591796875, + "learning_rate": 9.91956942244848e-06, + "loss": 0.0005, + "num_tokens": 1580204466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5527012033797047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007440359561010449, + "kl": 0.012420654296875, + "learning_rate": 9.91361174499189e-06, + "loss": 0.0005, + "num_tokens": 1580766738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5528718955364001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004497652678119959, + "kl": 0.0130462646484375, + "learning_rate": 9.907654098200005e-06, + "loss": 0.0005, + "num_tokens": 1581328258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5530425876930956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045586017241567683, + "kl": 0.013397216796875, + "learning_rate": 9.901696484187585e-06, + "loss": 0.0005, + "num_tokens": 1581894914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5532132798497909, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005964474331403606, + "kl": 0.0124053955078125, + "learning_rate": 9.895738905069361e-06, + "loss": 0.0005, + "num_tokens": 1582453874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5533839720064863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004994596376995464, + "kl": 0.0129852294921875, + "learning_rate": 9.889781362960067e-06, + "loss": 0.0005, + "num_tokens": 1583022642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5535546641631817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004462232731121674, + "kl": 0.012451171875, + "learning_rate": 9.883823859974408e-06, + "loss": 0.0005, + "num_tokens": 1583587618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5537253563198771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011004730629731558, + "kl": 0.012786865234375, + "learning_rate": 9.877866398227085e-06, + "loss": 0.0005, + "num_tokens": 1584153538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5538960484765725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038028695555799744, + "kl": 0.0125732421875, + "learning_rate": 9.87190897983278e-06, + "loss": 0.0005, + "num_tokens": 1584720434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5540667406332679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003513019435486358, + "kl": 0.013092041015625, + "learning_rate": 9.865951606906169e-06, + "loss": 0.0005, + "num_tokens": 1585284018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5542374327899633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004993014877177313, + "kl": 0.012664794921875, + "learning_rate": 9.859994281561892e-06, + "loss": 0.0005, + "num_tokens": 1585843890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5544081249466587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023999739814108153, + "kl": 0.0123748779296875, + "learning_rate": 9.854037005914596e-06, + "loss": 0.0005, + "num_tokens": 1586410818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5545788171033541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041774389074766165, + "kl": 0.012969970703125, + "learning_rate": 9.84807978207889e-06, + "loss": 0.0005, + "num_tokens": 1586972850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5547495092600495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001257794292688964, + "kl": 0.012115478515625, + "learning_rate": 9.842122612169382e-06, + "loss": 0.0005, + "num_tokens": 1587536162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5549202014167449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002569625723750175, + "kl": 0.0131988525390625, + "learning_rate": 9.836165498300645e-06, + "loss": 0.0005, + "num_tokens": 1588103714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5550908935734403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014625038084109532, + "kl": 0.0128021240234375, + "learning_rate": 9.830208442587239e-06, + "loss": 0.0005, + "num_tokens": 1588665218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5552615857301357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001832695533111286, + "kl": 0.0130157470703125, + "learning_rate": 9.82425144714371e-06, + "loss": 0.0005, + "num_tokens": 1589230306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5554322778868311, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007187917730627106, + "kl": 0.01348876953125, + "learning_rate": 9.818294514084576e-06, + "loss": 0.0005, + "num_tokens": 1589810498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5556029700435265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010868034429352977, + "kl": 0.0135498046875, + "learning_rate": 9.812337645524326e-06, + "loss": 0.0005, + "num_tokens": 1590380690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.555773662200222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008237440103737609, + "kl": 0.0124053955078125, + "learning_rate": 9.806380843577438e-06, + "loss": 0.0005, + "num_tokens": 1590950978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5559443543569172, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001475974882317469, + "kl": 0.013275146484375, + "learning_rate": 9.800424110358362e-06, + "loss": 0.0005, + "num_tokens": 1591515874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5561150465136127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007383419799895736, + "kl": 0.0130615234375, + "learning_rate": 9.794467447981525e-06, + "loss": 0.0005, + "num_tokens": 1592079202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5562857386703081, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000907858028799187, + "kl": 0.013763427734375, + "learning_rate": 9.788510858561323e-06, + "loss": 0.0006, + "num_tokens": 1592642338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5564564308270035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005503765008603697, + "kl": 0.0125732421875, + "learning_rate": 9.782554344212129e-06, + "loss": 0.0005, + "num_tokens": 1593205106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5566271229836989, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020727181936339682, + "kl": 0.0125732421875, + "learning_rate": 9.776597907048295e-06, + "loss": 0.0005, + "num_tokens": 1593777586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5567978151403943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021203223072721535, + "kl": 0.013031005859375, + "learning_rate": 9.770641549184143e-06, + "loss": 0.0005, + "num_tokens": 1594347506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5569685072970897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011034820289588406, + "kl": 0.01251220703125, + "learning_rate": 9.764685272733957e-06, + "loss": 0.0005, + "num_tokens": 1594914738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5571391994537851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005729379759385128, + "kl": 0.0130462646484375, + "learning_rate": 9.758729079812008e-06, + "loss": 0.0005, + "num_tokens": 1595481970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5573098916104805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013931951044131904, + "kl": 0.01287841796875, + "learning_rate": 9.752772972532524e-06, + "loss": 0.0005, + "num_tokens": 1596045794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5574805837671759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000815361438967299, + "kl": 0.0132293701171875, + "learning_rate": 9.746816953009715e-06, + "loss": 0.0005, + "num_tokens": 1596607282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5576512759238713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036453009692885923, + "kl": 0.0133819580078125, + "learning_rate": 9.740861023357742e-06, + "loss": 0.0005, + "num_tokens": 1597172882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5578219680805667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000522554624780081, + "kl": 0.0133209228515625, + "learning_rate": 9.734905185690755e-06, + "loss": 0.0005, + "num_tokens": 1597741666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5579926602372621, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010015929892010981, + "kl": 0.012298583984375, + "learning_rate": 9.728949442122853e-06, + "loss": 0.0005, + "num_tokens": 1598305042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5581633523939575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004363758262399082, + "kl": 0.0130615234375, + "learning_rate": 9.722993794768123e-06, + "loss": 0.0005, + "num_tokens": 1598872866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5583340445506529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008112016349273115, + "kl": 0.0130157470703125, + "learning_rate": 9.717038245740591e-06, + "loss": 0.0005, + "num_tokens": 1599452962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5585047367073483, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000537040012646514, + "kl": 0.0133056640625, + "learning_rate": 9.711082797154269e-06, + "loss": 0.0005, + "num_tokens": 1600014226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5586754288640436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021857782132558888, + "kl": 0.011993408203125, + "learning_rate": 9.705127451123122e-06, + "loss": 0.0005, + "num_tokens": 1600589922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.558846121020739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017470038529665476, + "kl": 0.012481689453125, + "learning_rate": 9.69917220976109e-06, + "loss": 0.0005, + "num_tokens": 1601161378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5590168131774345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015522471276089949, + "kl": 0.012969970703125, + "learning_rate": 9.693217075182062e-06, + "loss": 0.0005, + "num_tokens": 1601730514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5591875053341299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021058163911945763, + "kl": 0.01287841796875, + "learning_rate": 9.687262049499898e-06, + "loss": 0.0005, + "num_tokens": 1602296866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5593581974908253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015066454092992267, + "kl": 0.0139923095703125, + "learning_rate": 9.681307134828415e-06, + "loss": 0.0006, + "num_tokens": 1602859506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5595288896475207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016894803029080956, + "kl": 0.0135650634765625, + "learning_rate": 9.675352333281399e-06, + "loss": 0.0005, + "num_tokens": 1603419602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5596995818042161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004171956050431856, + "kl": 0.012908935546875, + "learning_rate": 9.669397646972586e-06, + "loss": 0.0005, + "num_tokens": 1603982018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5598702739609115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042677133531516235, + "kl": 0.012603759765625, + "learning_rate": 9.663443078015669e-06, + "loss": 0.0005, + "num_tokens": 1604547746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5600409661176069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003975409625348159, + "kl": 0.0128326416015625, + "learning_rate": 9.657488628524313e-06, + "loss": 0.0005, + "num_tokens": 1605109570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5602116582743023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008465907818292068, + "kl": 0.01300048828125, + "learning_rate": 9.651534300612134e-06, + "loss": 0.0005, + "num_tokens": 1605683826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5603823504309977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006791987852792816, + "kl": 0.0130157470703125, + "learning_rate": 9.645580096392695e-06, + "loss": 0.0005, + "num_tokens": 1606251010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5605530425876931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007499680192370059, + "kl": 0.0127105712890625, + "learning_rate": 9.639626017979526e-06, + "loss": 0.0005, + "num_tokens": 1606815490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5607237347443885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008038075559166687, + "kl": 0.01324462890625, + "learning_rate": 9.633672067486116e-06, + "loss": 0.0005, + "num_tokens": 1607378338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5608944269010839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006110822197469292, + "kl": 0.01226806640625, + "learning_rate": 9.627718247025897e-06, + "loss": 0.0005, + "num_tokens": 1607949906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5610651190577793, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003493490427124352, + "kl": 0.0129852294921875, + "learning_rate": 9.621764558712263e-06, + "loss": 0.0005, + "num_tokens": 1608518642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5612358112144747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005580290967835527, + "kl": 0.012481689453125, + "learning_rate": 9.615811004658555e-06, + "loss": 0.0005, + "num_tokens": 1609086770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.56140650337117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005256322079098542, + "kl": 0.0128021240234375, + "learning_rate": 9.609857586978073e-06, + "loss": 0.0005, + "num_tokens": 1609655378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5615771955278654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019207052396538488, + "kl": 0.0135345458984375, + "learning_rate": 9.603904307784064e-06, + "loss": 0.0005, + "num_tokens": 1610225394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5617478876845609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014307490315112144, + "kl": 0.0130157470703125, + "learning_rate": 9.597951169189727e-06, + "loss": 0.0005, + "num_tokens": 1610788834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5619185798412563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007254632273131154, + "kl": 0.013153076171875, + "learning_rate": 9.591998173308211e-06, + "loss": 0.0005, + "num_tokens": 1611350242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5620892719979517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002132955805041755, + "kl": 0.0125732421875, + "learning_rate": 9.586045322252617e-06, + "loss": 0.0005, + "num_tokens": 1611910802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5622599641546471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030477992548293506, + "kl": 0.0123291015625, + "learning_rate": 9.580092618135993e-06, + "loss": 0.0005, + "num_tokens": 1612476946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5624306563113425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001106287109847298, + "kl": 0.0131072998046875, + "learning_rate": 9.574140063071328e-06, + "loss": 0.0005, + "num_tokens": 1613040578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5626013484680379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031965887593571904, + "kl": 0.0126953125, + "learning_rate": 9.568187659171569e-06, + "loss": 0.0005, + "num_tokens": 1613618258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5627720406247333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002499403803997984, + "kl": 0.013092041015625, + "learning_rate": 9.562235408549603e-06, + "loss": 0.0005, + "num_tokens": 1614182626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5629427327814287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016150498457517232, + "kl": 0.013092041015625, + "learning_rate": 9.556283313318267e-06, + "loss": 0.0005, + "num_tokens": 1614745810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5631134249381241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001503952323768632, + "kl": 0.0130157470703125, + "learning_rate": 9.550331375590335e-06, + "loss": 0.0005, + "num_tokens": 1615308978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5632841170948195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001078473991260414, + "kl": 0.0132293701171875, + "learning_rate": 9.544379597478532e-06, + "loss": 0.0005, + "num_tokens": 1615873266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5634548092515149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000905226074039135, + "kl": 0.012420654296875, + "learning_rate": 9.538427981095526e-06, + "loss": 0.0005, + "num_tokens": 1616435010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5636255014082103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036776961821968594, + "kl": 0.01300048828125, + "learning_rate": 9.53247652855393e-06, + "loss": 0.0005, + "num_tokens": 1616997346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5637961935649057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014205108084278596, + "kl": 0.0127105712890625, + "learning_rate": 9.52652524196628e-06, + "loss": 0.0005, + "num_tokens": 1617559938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5639668857216011, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009811009055946602, + "kl": 0.0137786865234375, + "learning_rate": 9.520574123445085e-06, + "loss": 0.0006, + "num_tokens": 1618131954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5641375778782965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017645321474725156, + "kl": 0.0130462646484375, + "learning_rate": 9.514623175102766e-06, + "loss": 0.0005, + "num_tokens": 1618693714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5643082700349918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017582261904675687, + "kl": 0.0125274658203125, + "learning_rate": 9.508672399051702e-06, + "loss": 0.0005, + "num_tokens": 1619262994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5644789621916873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014237366299424222, + "kl": 0.0130615234375, + "learning_rate": 9.502721797404198e-06, + "loss": 0.0005, + "num_tokens": 1619827138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5646496543483827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008154681904296008, + "kl": 0.0131683349609375, + "learning_rate": 9.496771372272506e-06, + "loss": 0.0005, + "num_tokens": 1620395762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5648203465050781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006537341830411414, + "kl": 0.0128173828125, + "learning_rate": 9.490821125768809e-06, + "loss": 0.0005, + "num_tokens": 1620958850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5649910386617735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007546907981427298, + "kl": 0.0130462646484375, + "learning_rate": 9.484871060005236e-06, + "loss": 0.0005, + "num_tokens": 1621522994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5651617308184689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009929211356952364, + "kl": 0.012542724609375, + "learning_rate": 9.478921177093841e-06, + "loss": 0.0005, + "num_tokens": 1622089762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5653324229751643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018037813331590907, + "kl": 0.012481689453125, + "learning_rate": 9.472971479146614e-06, + "loss": 0.0005, + "num_tokens": 1622652626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5655031151318597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005028515079090742, + "kl": 0.01300048828125, + "learning_rate": 9.46702196827549e-06, + "loss": 0.0005, + "num_tokens": 1623229170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5656738072885551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010352878700962014, + "kl": 0.0128173828125, + "learning_rate": 9.461072646592331e-06, + "loss": 0.0005, + "num_tokens": 1623788866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5658444994452505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006498469219147193, + "kl": 0.013397216796875, + "learning_rate": 9.455123516208925e-06, + "loss": 0.0005, + "num_tokens": 1624357650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5660151916019459, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018235179107326714, + "kl": 0.01348876953125, + "learning_rate": 9.449174579237001e-06, + "loss": 0.0005, + "num_tokens": 1624924914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5661858837586413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011219069080625782, + "kl": 0.0123138427734375, + "learning_rate": 9.44322583778822e-06, + "loss": 0.0005, + "num_tokens": 1625488114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5663565759153367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040436977242387537, + "kl": 0.01336669921875, + "learning_rate": 9.437277293974169e-06, + "loss": 0.0005, + "num_tokens": 1626059538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5665272680720321, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001096950109431169, + "kl": 0.0124664306640625, + "learning_rate": 9.431328949906363e-06, + "loss": 0.0005, + "num_tokens": 1626622738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5666979602287275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020864914893728833, + "kl": 0.0128173828125, + "learning_rate": 9.425380807696251e-06, + "loss": 0.0005, + "num_tokens": 1627187218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5668686523854229, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003701614357012892, + "kl": 0.01336669921875, + "learning_rate": 9.419432869455211e-06, + "loss": 0.0005, + "num_tokens": 1627754050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5670393445421182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033243959399743486, + "kl": 0.0128021240234375, + "learning_rate": 9.413485137294548e-06, + "loss": 0.0005, + "num_tokens": 1628323330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5672100366988136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040251713460262047, + "kl": 0.0134124755859375, + "learning_rate": 9.407537613325485e-06, + "loss": 0.0005, + "num_tokens": 1628889986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.567380728855509, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005253044397816871, + "kl": 0.012481689453125, + "learning_rate": 9.401590299659185e-06, + "loss": 0.0005, + "num_tokens": 1629452338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5675514210122045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006107168334755947, + "kl": 0.0131072998046875, + "learning_rate": 9.395643198406727e-06, + "loss": 0.0005, + "num_tokens": 1630018962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5677221131688999, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014412004987056118, + "kl": 0.0129241943359375, + "learning_rate": 9.389696311679119e-06, + "loss": 0.0005, + "num_tokens": 1630588370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5678928053255953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005158115438327462, + "kl": 0.012908935546875, + "learning_rate": 9.383749641587285e-06, + "loss": 0.0005, + "num_tokens": 1631161314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5680634974822907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007799931699257092, + "kl": 0.013427734375, + "learning_rate": 9.377803190242086e-06, + "loss": 0.0005, + "num_tokens": 1631731042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5682341896389861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006169958778358152, + "kl": 0.0130615234375, + "learning_rate": 9.371856959754293e-06, + "loss": 0.0005, + "num_tokens": 1632291522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5684048817956815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010840444482634304, + "kl": 0.0129547119140625, + "learning_rate": 9.365910952234609e-06, + "loss": 0.0005, + "num_tokens": 1632851330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5685755739523769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003910819649774509, + "kl": 0.0124969482421875, + "learning_rate": 9.359965169793644e-06, + "loss": 0.0005, + "num_tokens": 1633417842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5687462661090723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005900833572024282, + "kl": 0.012664794921875, + "learning_rate": 9.35401961454194e-06, + "loss": 0.0005, + "num_tokens": 1633980162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5689169582657677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006692906794876518, + "kl": 0.0126495361328125, + "learning_rate": 9.348074288589953e-06, + "loss": 0.0005, + "num_tokens": 1634545954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5690876504224631, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007953651294956135, + "kl": 0.0124053955078125, + "learning_rate": 9.342129194048063e-06, + "loss": 0.0005, + "num_tokens": 1635108754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5692583425791585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010910994696527138, + "kl": 0.01300048828125, + "learning_rate": 9.33618433302656e-06, + "loss": 0.0005, + "num_tokens": 1635685682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5694290347358539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003707274153344208, + "kl": 0.01300048828125, + "learning_rate": 9.330239707635657e-06, + "loss": 0.0005, + "num_tokens": 1636254306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5695997268925493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015362934354020948, + "kl": 0.0126190185546875, + "learning_rate": 9.324295319985479e-06, + "loss": 0.0005, + "num_tokens": 1636816578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5697704190492446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016333417265050087, + "kl": 0.0126953125, + "learning_rate": 9.318351172186074e-06, + "loss": 0.0005, + "num_tokens": 1637380466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.56994111120594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022850884874386316, + "kl": 0.013214111328125, + "learning_rate": 9.312407266347396e-06, + "loss": 0.0005, + "num_tokens": 1637943426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5701118033626354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004464934144693786, + "kl": 0.0124969482421875, + "learning_rate": 9.306463604579314e-06, + "loss": 0.0005, + "num_tokens": 1638513906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5702824955193309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030269908753043957, + "kl": 0.01318359375, + "learning_rate": 9.30052018899162e-06, + "loss": 0.0005, + "num_tokens": 1639078514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5704531876760263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004141981661259525, + "kl": 0.0124359130859375, + "learning_rate": 9.294577021694013e-06, + "loss": 0.0005, + "num_tokens": 1639645730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5706238798327217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002361803430780655, + "kl": 0.013336181640625, + "learning_rate": 9.288634104796094e-06, + "loss": 0.0005, + "num_tokens": 1640211570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5707945719894171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000262397946626686, + "kl": 0.0127105712890625, + "learning_rate": 9.282691440407387e-06, + "loss": 0.0005, + "num_tokens": 1640780114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5709652641461125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040955669326492186, + "kl": 0.012420654296875, + "learning_rate": 9.27674903063733e-06, + "loss": 0.0005, + "num_tokens": 1641349090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5711359563028079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002716905637706711, + "kl": 0.0127716064453125, + "learning_rate": 9.27080687759526e-06, + "loss": 0.0005, + "num_tokens": 1641917234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5713066484595033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031616292826180985, + "kl": 0.0131988525390625, + "learning_rate": 9.264864983390423e-06, + "loss": 0.0005, + "num_tokens": 1642481778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5714773406161987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017009486690706366, + "kl": 0.0133209228515625, + "learning_rate": 9.25892335013198e-06, + "loss": 0.0005, + "num_tokens": 1643046850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5716480327728941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005369168464102019, + "kl": 0.0127410888671875, + "learning_rate": 9.252981979928997e-06, + "loss": 0.0005, + "num_tokens": 1643617698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5718187249295895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000477292019884734, + "kl": 0.0128021240234375, + "learning_rate": 9.247040874890447e-06, + "loss": 0.0005, + "num_tokens": 1644180530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5719894170862849, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000532724026967056, + "kl": 0.01275634765625, + "learning_rate": 9.241100037125205e-06, + "loss": 0.0005, + "num_tokens": 1644742882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5721601092429803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005602440250473206, + "kl": 0.0127410888671875, + "learning_rate": 9.235159468742053e-06, + "loss": 0.0005, + "num_tokens": 1645316530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5723308013996757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006337859654902406, + "kl": 0.0129547119140625, + "learning_rate": 9.229219171849683e-06, + "loss": 0.0005, + "num_tokens": 1645880066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.572501493556371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003321052130168187, + "kl": 0.0126953125, + "learning_rate": 9.223279148556685e-06, + "loss": 0.0005, + "num_tokens": 1646445474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5726721857130664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007567327508137515, + "kl": 0.012969970703125, + "learning_rate": 9.217339400971546e-06, + "loss": 0.0005, + "num_tokens": 1647009234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5728428778697618, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004968843106710029, + "kl": 0.0123443603515625, + "learning_rate": 9.21139993120267e-06, + "loss": 0.0005, + "num_tokens": 1647573922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5730135700264573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004987133310973328, + "kl": 0.0128326416015625, + "learning_rate": 9.205460741358353e-06, + "loss": 0.0005, + "num_tokens": 1648145106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5731842621831527, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015981632768254771, + "kl": 0.0127716064453125, + "learning_rate": 9.199521833546792e-06, + "loss": 0.0005, + "num_tokens": 1648713730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5733549543398481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004514331296419093, + "kl": 0.012481689453125, + "learning_rate": 9.19358320987608e-06, + "loss": 0.0005, + "num_tokens": 1649278162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5735256464965435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011567629155900167, + "kl": 0.0130615234375, + "learning_rate": 9.187644872454222e-06, + "loss": 0.0005, + "num_tokens": 1649842370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5736963386532389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006507934366039667, + "kl": 0.0123443603515625, + "learning_rate": 9.181706823389107e-06, + "loss": 0.0005, + "num_tokens": 1650414386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5738670308099343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004461218420380088, + "kl": 0.0133209228515625, + "learning_rate": 9.175769064788538e-06, + "loss": 0.0005, + "num_tokens": 1650980898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5740377229666297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004383871875369161, + "kl": 0.012481689453125, + "learning_rate": 9.16983159876019e-06, + "loss": 0.0005, + "num_tokens": 1651547426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5742084151233251, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005855950369811259, + "kl": 0.01220703125, + "learning_rate": 9.163894427411662e-06, + "loss": 0.0005, + "num_tokens": 1652109874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5743791072800205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002681573521280015, + "kl": 0.013031005859375, + "learning_rate": 9.157957552850426e-06, + "loss": 0.0005, + "num_tokens": 1652684434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5745497994367159, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.551953230078516e-05, + "kl": 0.0125274658203125, + "learning_rate": 9.152020977183869e-06, + "loss": 0.0005, + "num_tokens": 1653244434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5747204915934113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005264012597062274, + "kl": 0.0126800537109375, + "learning_rate": 9.146084702519252e-06, + "loss": 0.0005, + "num_tokens": 1653804338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5748911837501067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006513175649984426, + "kl": 0.012908935546875, + "learning_rate": 9.14014873096374e-06, + "loss": 0.0005, + "num_tokens": 1654370562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5750618759068021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034472458968726103, + "kl": 0.01263427734375, + "learning_rate": 9.134213064624387e-06, + "loss": 0.0005, + "num_tokens": 1654933506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5752325680634974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003567238420450334, + "kl": 0.012359619140625, + "learning_rate": 9.128277705608148e-06, + "loss": 0.0005, + "num_tokens": 1655498482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5754032602201928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020117062875422325, + "kl": 0.0135040283203125, + "learning_rate": 9.122342656021855e-06, + "loss": 0.0005, + "num_tokens": 1656061250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5755739523768882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017046800751776988, + "kl": 0.01312255859375, + "learning_rate": 9.116407917972235e-06, + "loss": 0.0005, + "num_tokens": 1656628514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5757446445335836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015927532953120813, + "kl": 0.0131988525390625, + "learning_rate": 9.11047349356591e-06, + "loss": 0.0005, + "num_tokens": 1657195858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5759153366902791, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011288323036629298, + "kl": 0.0125274658203125, + "learning_rate": 9.104539384909387e-06, + "loss": 0.0005, + "num_tokens": 1657762162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5760860288469745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014776287482172245, + "kl": 0.0125885009765625, + "learning_rate": 9.098605594109055e-06, + "loss": 0.0005, + "num_tokens": 1658336658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5762567210036699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001031168644360462, + "kl": 0.0124969482421875, + "learning_rate": 9.092672123271197e-06, + "loss": 0.0005, + "num_tokens": 1658912658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5764274131603653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008806085978042568, + "kl": 0.0131072998046875, + "learning_rate": 9.086738974501986e-06, + "loss": 0.0005, + "num_tokens": 1659478242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5765981053170607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026515017557519096, + "kl": 0.0132293701171875, + "learning_rate": 9.080806149907474e-06, + "loss": 0.0005, + "num_tokens": 1660048194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5767687974737561, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013151541108041573, + "kl": 0.0123748779296875, + "learning_rate": 9.074873651593596e-06, + "loss": 0.0005, + "num_tokens": 1660612066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5769394896304515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024545215093147604, + "kl": 0.0124969482421875, + "learning_rate": 9.068941481666172e-06, + "loss": 0.0005, + "num_tokens": 1661174866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5771101817871469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003442113865958573, + "kl": 0.012359619140625, + "learning_rate": 9.063009642230917e-06, + "loss": 0.0005, + "num_tokens": 1661745522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5772808739438423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012214303270972768, + "kl": 0.012603759765625, + "learning_rate": 9.057078135393417e-06, + "loss": 0.0005, + "num_tokens": 1662305106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5774515661005377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010776002487613992, + "kl": 0.0126190185546875, + "learning_rate": 9.051146963259137e-06, + "loss": 0.0005, + "num_tokens": 1662867474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5776222582572331, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005215062979756558, + "kl": 0.0121307373046875, + "learning_rate": 9.045216127933435e-06, + "loss": 0.0005, + "num_tokens": 1663429970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5777929504139285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006345801014819257, + "kl": 0.013092041015625, + "learning_rate": 9.039285631521544e-06, + "loss": 0.0005, + "num_tokens": 1663994322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5779636425706238, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002453481622206308, + "kl": 0.0134429931640625, + "learning_rate": 9.033355476128574e-06, + "loss": 0.0005, + "num_tokens": 1664557538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5781343347273192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004301081026707089, + "kl": 0.012786865234375, + "learning_rate": 9.027425663859514e-06, + "loss": 0.0005, + "num_tokens": 1665128050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5783050268840146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003303322452600682, + "kl": 0.0130462646484375, + "learning_rate": 9.021496196819236e-06, + "loss": 0.0005, + "num_tokens": 1665696850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.57847571904071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003469163161299306, + "kl": 0.0128936767578125, + "learning_rate": 9.015567077112484e-06, + "loss": 0.0005, + "num_tokens": 1666258530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5786464111974055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009151727041264977, + "kl": 0.0123748779296875, + "learning_rate": 9.00963830684389e-06, + "loss": 0.0005, + "num_tokens": 1666829154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5788171033541009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006418279573368094, + "kl": 0.013153076171875, + "learning_rate": 9.003709888117943e-06, + "loss": 0.0005, + "num_tokens": 1667391010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5789877955107963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006621011962276762, + "kl": 0.013031005859375, + "learning_rate": 8.99778182303902e-06, + "loss": 0.0005, + "num_tokens": 1667954018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5791584876674917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005580082014519761, + "kl": 0.012908935546875, + "learning_rate": 8.991854113711371e-06, + "loss": 0.0005, + "num_tokens": 1668523506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5793291798241871, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010638569774193005, + "kl": 0.01397705078125, + "learning_rate": 8.985926762239126e-06, + "loss": 0.0006, + "num_tokens": 1669092930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5794998719808825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047501052287881516, + "kl": 0.01263427734375, + "learning_rate": 8.979999770726269e-06, + "loss": 0.0005, + "num_tokens": 1669666994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5796705641375779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001459832313275622, + "kl": 0.01336669921875, + "learning_rate": 8.974073141276677e-06, + "loss": 0.0005, + "num_tokens": 1670228002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5798412562942733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006351745375723284, + "kl": 0.012664794921875, + "learning_rate": 8.968146875994082e-06, + "loss": 0.0005, + "num_tokens": 1670788930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5800119484509687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009130364610635271, + "kl": 0.013031005859375, + "learning_rate": 8.962220976982103e-06, + "loss": 0.0005, + "num_tokens": 1671350498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5801826406076641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014528990492425518, + "kl": 0.0125732421875, + "learning_rate": 8.956295446344212e-06, + "loss": 0.0005, + "num_tokens": 1671922210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5803533327643595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001827809891894381, + "kl": 0.012725830078125, + "learning_rate": 8.950370286183762e-06, + "loss": 0.0005, + "num_tokens": 1672493314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5805240249210549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0041057883057878525, + "kl": 0.012786865234375, + "learning_rate": 8.944445498603976e-06, + "loss": 0.0005, + "num_tokens": 1673055762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5806947170777502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005826442805975279, + "kl": 0.01300048828125, + "learning_rate": 8.938521085707927e-06, + "loss": 0.0005, + "num_tokens": 1673623218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5808654092344456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003842484344477001, + "kl": 0.0124969482421875, + "learning_rate": 8.93259704959858e-06, + "loss": 0.0005, + "num_tokens": 1674187906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.581036101391141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010076672469399859, + "kl": 0.012908935546875, + "learning_rate": 8.926673392378746e-06, + "loss": 0.0005, + "num_tokens": 1674747810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5812067935478364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014656321521310645, + "kl": 0.0129547119140625, + "learning_rate": 8.92075011615112e-06, + "loss": 0.0005, + "num_tokens": 1675312178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5813774857045318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015520446294348751, + "kl": 0.0125274658203125, + "learning_rate": 8.914827223018236e-06, + "loss": 0.0005, + "num_tokens": 1675878498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5815481778612273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000640043432639793, + "kl": 0.012542724609375, + "learning_rate": 8.908904715082519e-06, + "loss": 0.0005, + "num_tokens": 1676444050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5817188700179227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004201251278907688, + "kl": 0.0129241943359375, + "learning_rate": 8.90298259444624e-06, + "loss": 0.0005, + "num_tokens": 1677005618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5818895621746181, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033102983088113018, + "kl": 0.01397705078125, + "learning_rate": 8.897060863211545e-06, + "loss": 0.0006, + "num_tokens": 1677570258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5820602543313135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017891267174818199, + "kl": 0.01385498046875, + "learning_rate": 8.891139523480424e-06, + "loss": 0.0006, + "num_tokens": 1678135410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5822309464880089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031100972570547786, + "kl": 0.01318359375, + "learning_rate": 8.885218577354747e-06, + "loss": 0.0005, + "num_tokens": 1678704914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5824016386447043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017112388659099018, + "kl": 0.013153076171875, + "learning_rate": 8.879298026936232e-06, + "loss": 0.0005, + "num_tokens": 1679269330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5825723308013997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003222555850333993, + "kl": 0.0130615234375, + "learning_rate": 8.873377874326466e-06, + "loss": 0.0005, + "num_tokens": 1679839826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5827430229580951, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006547618077304587, + "kl": 0.0130615234375, + "learning_rate": 8.867458121626883e-06, + "loss": 0.0005, + "num_tokens": 1680402738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5829137151147905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001655083605818078, + "kl": 0.01202392578125, + "learning_rate": 8.861538770938784e-06, + "loss": 0.0005, + "num_tokens": 1680970994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5830844072714859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008905709061701385, + "kl": 0.0132598876953125, + "learning_rate": 8.855619824363326e-06, + "loss": 0.0005, + "num_tokens": 1681539170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5832550994281813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002909199790295517, + "kl": 0.01287841796875, + "learning_rate": 8.849701284001523e-06, + "loss": 0.0005, + "num_tokens": 1682109058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5834257915848767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020053433572666645, + "kl": 0.0124664306640625, + "learning_rate": 8.84378315195424e-06, + "loss": 0.0005, + "num_tokens": 1682676754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.583596483741572, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004991581416007982, + "kl": 0.0123291015625, + "learning_rate": 8.837865430322197e-06, + "loss": 0.0005, + "num_tokens": 1683241634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5837671758982674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004960801270968522, + "kl": 0.0130462646484375, + "learning_rate": 8.831948121205979e-06, + "loss": 0.0005, + "num_tokens": 1683807714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5839378680549628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036962445707256603, + "kl": 0.0132598876953125, + "learning_rate": 8.826031226706015e-06, + "loss": 0.0005, + "num_tokens": 1684379730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5841085602116582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018272165622532864, + "kl": 0.0129852294921875, + "learning_rate": 8.820114748922585e-06, + "loss": 0.0005, + "num_tokens": 1684944354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5842792523683537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008669715554448791, + "kl": 0.0134735107421875, + "learning_rate": 8.814198689955828e-06, + "loss": 0.0005, + "num_tokens": 1685506706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5844499445250491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000846666287569038, + "kl": 0.0128021240234375, + "learning_rate": 8.80828305190573e-06, + "loss": 0.0005, + "num_tokens": 1686069170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5846206366817445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000953999117033132, + "kl": 0.0128631591796875, + "learning_rate": 8.802367836872135e-06, + "loss": 0.0005, + "num_tokens": 1686632514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5847913288384399, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001535109421170908, + "kl": 0.01361083984375, + "learning_rate": 8.79645304695472e-06, + "loss": 0.0005, + "num_tokens": 1687225746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5849620209951353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005827789537807618, + "kl": 0.0125885009765625, + "learning_rate": 8.790538684253031e-06, + "loss": 0.0005, + "num_tokens": 1687795058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5851327131518307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007891405536557502, + "kl": 0.0133819580078125, + "learning_rate": 8.78462475086645e-06, + "loss": 0.0005, + "num_tokens": 1688358466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5853034053085261, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003833499183987484, + "kl": 0.0128631591796875, + "learning_rate": 8.77871124889421e-06, + "loss": 0.0005, + "num_tokens": 1688926946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5854740974652215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018910339402625, + "kl": 0.012908935546875, + "learning_rate": 8.772798180435388e-06, + "loss": 0.0005, + "num_tokens": 1689485042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5856447896219169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000730148936776918, + "kl": 0.0130767822265625, + "learning_rate": 8.766885547588914e-06, + "loss": 0.0005, + "num_tokens": 1690051362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5858154817786123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005271491107400756, + "kl": 0.0129547119140625, + "learning_rate": 8.760973352453556e-06, + "loss": 0.0005, + "num_tokens": 1690614818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5859861739353077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041995629576883686, + "kl": 0.0135040283203125, + "learning_rate": 8.75506159712794e-06, + "loss": 0.0005, + "num_tokens": 1691183570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5861568660920031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005403149319637922, + "kl": 0.0125274658203125, + "learning_rate": 8.749150283710508e-06, + "loss": 0.0005, + "num_tokens": 1691746994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5863275582486984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008577289941818462, + "kl": 0.013214111328125, + "learning_rate": 8.743239414299576e-06, + "loss": 0.0005, + "num_tokens": 1692308978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5864982504053938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005908152705369779, + "kl": 0.0126800537109375, + "learning_rate": 8.737328990993283e-06, + "loss": 0.0005, + "num_tokens": 1692875746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5866689425620892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017552631048360528, + "kl": 0.01348876953125, + "learning_rate": 8.731419015889621e-06, + "loss": 0.0005, + "num_tokens": 1693441378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5868396347187846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034358116006812545, + "kl": 0.0124969482421875, + "learning_rate": 8.725509491086414e-06, + "loss": 0.0005, + "num_tokens": 1694004450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.58701032687548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035756266577069635, + "kl": 0.0135040283203125, + "learning_rate": 8.719600418681334e-06, + "loss": 0.0005, + "num_tokens": 1694566194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5871810190321755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008264839096015989, + "kl": 0.01324462890625, + "learning_rate": 8.71369180077188e-06, + "loss": 0.0005, + "num_tokens": 1695131538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5873517111888709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007749589610024819, + "kl": 0.013397216796875, + "learning_rate": 8.70778363945541e-06, + "loss": 0.0005, + "num_tokens": 1695693826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5875224033455663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005479907244584672, + "kl": 0.0126953125, + "learning_rate": 8.701875936829097e-06, + "loss": 0.0005, + "num_tokens": 1696257906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5876930955022617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002227951031304791, + "kl": 0.01318359375, + "learning_rate": 8.695968694989968e-06, + "loss": 0.0005, + "num_tokens": 1696825906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5878637876589571, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005797277911300934, + "kl": 0.0128936767578125, + "learning_rate": 8.690061916034877e-06, + "loss": 0.0005, + "num_tokens": 1697391346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5880344798156525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004694137226883188, + "kl": 0.01312255859375, + "learning_rate": 8.684155602060527e-06, + "loss": 0.0005, + "num_tokens": 1697962466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5882051719723479, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005181473577197986, + "kl": 0.0140380859375, + "learning_rate": 8.678249755163434e-06, + "loss": 0.0006, + "num_tokens": 1698532994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5883758641290433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011325676546291, + "kl": 0.0133209228515625, + "learning_rate": 8.672344377439964e-06, + "loss": 0.0005, + "num_tokens": 1699093714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5885465562857387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044683632091627835, + "kl": 0.0129852294921875, + "learning_rate": 8.666439470986316e-06, + "loss": 0.0005, + "num_tokens": 1699677490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5887172484424341, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005447307238915464, + "kl": 0.01300048828125, + "learning_rate": 8.66053503789852e-06, + "loss": 0.0005, + "num_tokens": 1700236898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5888879405991295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009335463729983333, + "kl": 0.0124359130859375, + "learning_rate": 8.654631080272431e-06, + "loss": 0.0005, + "num_tokens": 1700802242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5890586327558248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004587964125889496, + "kl": 0.01324462890625, + "learning_rate": 8.648727600203741e-06, + "loss": 0.0005, + "num_tokens": 1701368882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5892293249125202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002832020140405562, + "kl": 0.0128021240234375, + "learning_rate": 8.642824599787977e-06, + "loss": 0.0005, + "num_tokens": 1701933922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5894000170692156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000604835761520345, + "kl": 0.01300048828125, + "learning_rate": 8.636922081120493e-06, + "loss": 0.0005, + "num_tokens": 1702505266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.589570709225911, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005824797001683278, + "kl": 0.012969970703125, + "learning_rate": 8.631020046296462e-06, + "loss": 0.0005, + "num_tokens": 1703067778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5897414013826064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002244893738427231, + "kl": 0.013214111328125, + "learning_rate": 8.625118497410895e-06, + "loss": 0.0005, + "num_tokens": 1703632850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5899120935393019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006871126430560054, + "kl": 0.012939453125, + "learning_rate": 8.619217436558633e-06, + "loss": 0.0005, + "num_tokens": 1704191106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5900827856959973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007188667919417145, + "kl": 0.0127716064453125, + "learning_rate": 8.613316865834341e-06, + "loss": 0.0005, + "num_tokens": 1704763874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5902534778526927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000634236651035011, + "kl": 0.0127105712890625, + "learning_rate": 8.6074167873325e-06, + "loss": 0.0005, + "num_tokens": 1705335410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5904241700093881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007933934451349356, + "kl": 0.0130767822265625, + "learning_rate": 8.60151720314743e-06, + "loss": 0.0005, + "num_tokens": 1705898722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5905948621660835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012771186347316463, + "kl": 0.0130462646484375, + "learning_rate": 8.595618115373276e-06, + "loss": 0.0005, + "num_tokens": 1706468130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5907655543227789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044071223376176803, + "kl": 0.01336669921875, + "learning_rate": 8.589719526103995e-06, + "loss": 0.0005, + "num_tokens": 1707032370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5909362464794743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007069311403579728, + "kl": 0.0131988525390625, + "learning_rate": 8.583821437433369e-06, + "loss": 0.0005, + "num_tokens": 1707598082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5911069386361697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013854443914912385, + "kl": 0.0125579833984375, + "learning_rate": 8.577923851455014e-06, + "loss": 0.0005, + "num_tokens": 1708167954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5912776307928651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011129604039877288, + "kl": 0.01275634765625, + "learning_rate": 8.572026770262356e-06, + "loss": 0.0005, + "num_tokens": 1708732050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5914483229495605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004469820784654116, + "kl": 0.012115478515625, + "learning_rate": 8.566130195948654e-06, + "loss": 0.0005, + "num_tokens": 1709296930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5916190151062559, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002774055428321039, + "kl": 0.012969970703125, + "learning_rate": 8.560234130606967e-06, + "loss": 0.0005, + "num_tokens": 1709861666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5917897072629512, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011039382072589878, + "kl": 0.012603759765625, + "learning_rate": 8.554338576330192e-06, + "loss": 0.0005, + "num_tokens": 1710428162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5919603994196466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002527842029417965, + "kl": 0.0126495361328125, + "learning_rate": 8.548443535211034e-06, + "loss": 0.0005, + "num_tokens": 1710994978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.592131091576342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010123327726599043, + "kl": 0.0130157470703125, + "learning_rate": 8.54254900934203e-06, + "loss": 0.0005, + "num_tokens": 1711557410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5923017837330374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007430081059212213, + "kl": 0.0132904052734375, + "learning_rate": 8.536655000815514e-06, + "loss": 0.0005, + "num_tokens": 1712120434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5924724758897328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009039251431122269, + "kl": 0.012359619140625, + "learning_rate": 8.530761511723647e-06, + "loss": 0.0005, + "num_tokens": 1712684530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5926431680464282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013565990133969045, + "kl": 0.0136871337890625, + "learning_rate": 8.524868544158407e-06, + "loss": 0.0005, + "num_tokens": 1713248562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5928138602031237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005244567071292372, + "kl": 0.01287841796875, + "learning_rate": 8.518976100211587e-06, + "loss": 0.0005, + "num_tokens": 1713811026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5929845523598191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010546887147839382, + "kl": 0.0130157470703125, + "learning_rate": 8.51308418197479e-06, + "loss": 0.0005, + "num_tokens": 1714373186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5931552445165145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003257464828426257, + "kl": 0.0123138427734375, + "learning_rate": 8.507192791539428e-06, + "loss": 0.0005, + "num_tokens": 1714936738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5933259366732099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006562350035753072, + "kl": 0.0130615234375, + "learning_rate": 8.501301930996742e-06, + "loss": 0.0005, + "num_tokens": 1715502162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5934966288299053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011231159426850436, + "kl": 0.0134429931640625, + "learning_rate": 8.495411602437772e-06, + "loss": 0.0005, + "num_tokens": 1716068738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5936673209866007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007239377067115066, + "kl": 0.012542724609375, + "learning_rate": 8.48952180795337e-06, + "loss": 0.0005, + "num_tokens": 1716630978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5938380131432961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000272307559746307, + "kl": 0.0134124755859375, + "learning_rate": 8.483632549634198e-06, + "loss": 0.0005, + "num_tokens": 1717194306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5940087052999915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007167009053965693, + "kl": 0.01263427734375, + "learning_rate": 8.477743829570734e-06, + "loss": 0.0005, + "num_tokens": 1717763186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5941793974566869, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015272065947457268, + "kl": 0.0128173828125, + "learning_rate": 8.471855649853265e-06, + "loss": 0.0005, + "num_tokens": 1718329730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5943500896133823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037824455357419914, + "kl": 0.0135040283203125, + "learning_rate": 8.465968012571874e-06, + "loss": 0.0005, + "num_tokens": 1718894242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5945207817700776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001426992039946889, + "kl": 0.0125885009765625, + "learning_rate": 8.46008091981646e-06, + "loss": 0.0005, + "num_tokens": 1719465602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.594691473926773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036119485004901085, + "kl": 0.01312255859375, + "learning_rate": 8.454194373676735e-06, + "loss": 0.0005, + "num_tokens": 1720027538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5948621660834684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008551323585552917, + "kl": 0.0131378173828125, + "learning_rate": 8.44830837624221e-06, + "loss": 0.0005, + "num_tokens": 1720585330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5950328582401638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005456588371558655, + "kl": 0.0136566162109375, + "learning_rate": 8.442422929602192e-06, + "loss": 0.0005, + "num_tokens": 1721148626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5952035503968592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004947620752783358, + "kl": 0.0133209228515625, + "learning_rate": 8.436538035845811e-06, + "loss": 0.0005, + "num_tokens": 1721723602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5953742425535546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034657724225791787, + "kl": 0.0128021240234375, + "learning_rate": 8.430653697061991e-06, + "loss": 0.0005, + "num_tokens": 1722286626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.59554493471025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005641644518378794, + "kl": 0.0125885009765625, + "learning_rate": 8.42476991533946e-06, + "loss": 0.0005, + "num_tokens": 1722856930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5957156268669455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006090788966184943, + "kl": 0.01300048828125, + "learning_rate": 8.418886692766743e-06, + "loss": 0.0005, + "num_tokens": 1723420754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5958863190236409, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025572512950744063, + "kl": 0.0131378173828125, + "learning_rate": 8.413004031432176e-06, + "loss": 0.0005, + "num_tokens": 1723988338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5960570111803363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005591677323940029, + "kl": 0.0125274658203125, + "learning_rate": 8.407121933423891e-06, + "loss": 0.0005, + "num_tokens": 1724551074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5962277033370317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026280482843834245, + "kl": 0.0134735107421875, + "learning_rate": 8.401240400829822e-06, + "loss": 0.0005, + "num_tokens": 1725116130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5963983954937271, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006194130950881612, + "kl": 0.012969970703125, + "learning_rate": 8.395359435737694e-06, + "loss": 0.0005, + "num_tokens": 1725684546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5965690876504225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007329259790004343, + "kl": 0.0128936767578125, + "learning_rate": 8.389479040235045e-06, + "loss": 0.0005, + "num_tokens": 1726251330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5967397798071179, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009471515924786999, + "kl": 0.012542724609375, + "learning_rate": 8.383599216409198e-06, + "loss": 0.0005, + "num_tokens": 1726814706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5969104719638133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017389037572650666, + "kl": 0.0129241943359375, + "learning_rate": 8.377719966347287e-06, + "loss": 0.0005, + "num_tokens": 1727379746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5970811641205087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048684944482055325, + "kl": 0.013153076171875, + "learning_rate": 8.371841292136221e-06, + "loss": 0.0005, + "num_tokens": 1727942050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.597251856277204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013959743057972938, + "kl": 0.0126800537109375, + "learning_rate": 8.365963195862725e-06, + "loss": 0.0005, + "num_tokens": 1728503090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5974225484338994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002789675489475943, + "kl": 0.012664794921875, + "learning_rate": 8.360085679613306e-06, + "loss": 0.0005, + "num_tokens": 1729069154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5975932405905948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002689738808392745, + "kl": 0.0133514404296875, + "learning_rate": 8.354208745474279e-06, + "loss": 0.0005, + "num_tokens": 1729629250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5977639327472902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007127486678952855, + "kl": 0.0133209228515625, + "learning_rate": 8.348332395531735e-06, + "loss": 0.0005, + "num_tokens": 1730195970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5979346249039856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004170392865158679, + "kl": 0.012908935546875, + "learning_rate": 8.342456631871569e-06, + "loss": 0.0005, + "num_tokens": 1730761218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.598105317060681, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008444526171363357, + "kl": 0.012939453125, + "learning_rate": 8.336581456579462e-06, + "loss": 0.0005, + "num_tokens": 1731321698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5982760092173764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019411845152391384, + "kl": 0.0133209228515625, + "learning_rate": 8.330706871740896e-06, + "loss": 0.0005, + "num_tokens": 1731899890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5984467013740719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005176707968962345, + "kl": 0.013458251953125, + "learning_rate": 8.324832879441132e-06, + "loss": 0.0005, + "num_tokens": 1732464018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5986173935307673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014074463235010544, + "kl": 0.01226806640625, + "learning_rate": 8.318959481765222e-06, + "loss": 0.0005, + "num_tokens": 1733027746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5987880856874627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020465576329343135, + "kl": 0.013031005859375, + "learning_rate": 8.313086680798017e-06, + "loss": 0.0005, + "num_tokens": 1733589634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5989587778441581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007731148354175758, + "kl": 0.0130462646484375, + "learning_rate": 8.307214478624148e-06, + "loss": 0.0005, + "num_tokens": 1734155970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5991294700008535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007834664867386066, + "kl": 0.0139617919921875, + "learning_rate": 8.301342877328032e-06, + "loss": 0.0006, + "num_tokens": 1734728242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5993001621575489, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028839858929824035, + "kl": 0.0123443603515625, + "learning_rate": 8.295471878993873e-06, + "loss": 0.0005, + "num_tokens": 1735297394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5994708543142443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004711372340857144, + "kl": 0.012786865234375, + "learning_rate": 8.289601485705669e-06, + "loss": 0.0005, + "num_tokens": 1735863234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5996415464709397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024083454550028876, + "kl": 0.01220703125, + "learning_rate": 8.283731699547198e-06, + "loss": 0.0005, + "num_tokens": 1736429986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5998122386276351, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012069454224948016, + "kl": 0.0129852294921875, + "learning_rate": 8.277862522602019e-06, + "loss": 0.0005, + "num_tokens": 1737001698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5999829307843305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009024229215626186, + "kl": 0.0124359130859375, + "learning_rate": 8.271993956953474e-06, + "loss": 0.0005, + "num_tokens": 1737569746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6001536229410258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002345461128091502, + "kl": 0.0125885009765625, + "learning_rate": 8.266126004684702e-06, + "loss": 0.0005, + "num_tokens": 1738134034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6003243150977212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022205619800553054, + "kl": 0.013824462890625, + "learning_rate": 8.26025866787861e-06, + "loss": 0.0006, + "num_tokens": 1738694834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6004950072544166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007817818910740001, + "kl": 0.012176513671875, + "learning_rate": 8.254391948617885e-06, + "loss": 0.0005, + "num_tokens": 1739257682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.600665699411112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006164718442260716, + "kl": 0.0130767822265625, + "learning_rate": 8.24852584898501e-06, + "loss": 0.0005, + "num_tokens": 1739825442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6008363915678074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010061069954637322, + "kl": 0.012908935546875, + "learning_rate": 8.242660371062231e-06, + "loss": 0.0005, + "num_tokens": 1740392322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6010070837245028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007402050225730569, + "kl": 0.0132293701171875, + "learning_rate": 8.236795516931587e-06, + "loss": 0.0005, + "num_tokens": 1740954866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6011777758811983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002577830777364602, + "kl": 0.01287841796875, + "learning_rate": 8.230931288674882e-06, + "loss": 0.0005, + "num_tokens": 1741522450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6013484680378937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001985377671865731, + "kl": 0.0129241943359375, + "learning_rate": 8.225067688373713e-06, + "loss": 0.0005, + "num_tokens": 1742087122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6015191601945891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040293817256050276, + "kl": 0.0130157470703125, + "learning_rate": 8.219204718109442e-06, + "loss": 0.0005, + "num_tokens": 1742651650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6016898523512845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005398052202293633, + "kl": 0.013641357421875, + "learning_rate": 8.213342379963217e-06, + "loss": 0.0005, + "num_tokens": 1743213666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6018605445079799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008119504470296162, + "kl": 0.0132904052734375, + "learning_rate": 8.207480676015946e-06, + "loss": 0.0005, + "num_tokens": 1743776530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6020312366646753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006650555457204761, + "kl": 0.012542724609375, + "learning_rate": 8.201619608348332e-06, + "loss": 0.0005, + "num_tokens": 1744342770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6022019288213707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004067549289101596, + "kl": 0.0131683349609375, + "learning_rate": 8.195759179040839e-06, + "loss": 0.0005, + "num_tokens": 1744912466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6023726209780661, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000398849051668817, + "kl": 0.013275146484375, + "learning_rate": 8.18989939017371e-06, + "loss": 0.0005, + "num_tokens": 1745476626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6025433131347615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029566006892522306, + "kl": 0.01300048828125, + "learning_rate": 8.184040243826954e-06, + "loss": 0.0005, + "num_tokens": 1746043602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6027140052914569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022686456960766202, + "kl": 0.0130462646484375, + "learning_rate": 8.17818174208036e-06, + "loss": 0.0005, + "num_tokens": 1746607586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6028846974481522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016954888233688997, + "kl": 0.0134124755859375, + "learning_rate": 8.172323887013483e-06, + "loss": 0.0005, + "num_tokens": 1747172482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6030553896048476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007017045752079068, + "kl": 0.0135040283203125, + "learning_rate": 8.166466680705652e-06, + "loss": 0.0005, + "num_tokens": 1747741010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.603226081761543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004968313868364668, + "kl": 0.0129547119140625, + "learning_rate": 8.160610125235963e-06, + "loss": 0.0005, + "num_tokens": 1748305474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6033967739182384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045976206043555045, + "kl": 0.012908935546875, + "learning_rate": 8.154754222683279e-06, + "loss": 0.0005, + "num_tokens": 1748876978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6035674660749338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001027650277468948, + "kl": 0.012969970703125, + "learning_rate": 8.148898975126237e-06, + "loss": 0.0005, + "num_tokens": 1749440690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6037381582316292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002275582802408741, + "kl": 0.01318359375, + "learning_rate": 8.143044384643242e-06, + "loss": 0.0005, + "num_tokens": 1750003730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6039088503883246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004637716649237072, + "kl": 0.012725830078125, + "learning_rate": 8.137190453312453e-06, + "loss": 0.0005, + "num_tokens": 1750566882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.60407954254502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010806320101778974, + "kl": 0.01287841796875, + "learning_rate": 8.131337183211811e-06, + "loss": 0.0005, + "num_tokens": 1751129714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6042502347017155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002535871400805299, + "kl": 0.013092041015625, + "learning_rate": 8.125484576419014e-06, + "loss": 0.0005, + "num_tokens": 1751694194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6044209268584109, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000154613250948397, + "kl": 0.0128936767578125, + "learning_rate": 8.119632635011529e-06, + "loss": 0.0005, + "num_tokens": 1752267730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6045916190151063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003824313723366254, + "kl": 0.0131683349609375, + "learning_rate": 8.113781361066579e-06, + "loss": 0.0005, + "num_tokens": 1752832066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6047623111718017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001323922964569407, + "kl": 0.0130462646484375, + "learning_rate": 8.107930756661155e-06, + "loss": 0.0005, + "num_tokens": 1753396146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6049330033284971, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019265661558231186, + "kl": 0.012786865234375, + "learning_rate": 8.102080823872016e-06, + "loss": 0.0005, + "num_tokens": 1753962466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6051036954851925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002202099816556693, + "kl": 0.012786865234375, + "learning_rate": 8.096231564775674e-06, + "loss": 0.0005, + "num_tokens": 1754530146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6052743876418879, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013634288747035235, + "kl": 0.01324462890625, + "learning_rate": 8.090382981448403e-06, + "loss": 0.0005, + "num_tokens": 1755097426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6054450797985833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014220689361748933, + "kl": 0.0136871337890625, + "learning_rate": 8.084535075966237e-06, + "loss": 0.0005, + "num_tokens": 1755663906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6056157719552786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003249736353502215, + "kl": 0.0128173828125, + "learning_rate": 8.078687850404977e-06, + "loss": 0.0005, + "num_tokens": 1756229986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.605786464111974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000195172209647997, + "kl": 0.0136566162109375, + "learning_rate": 8.07284130684018e-06, + "loss": 0.0005, + "num_tokens": 1756796786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6059571562686694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013556934097951113, + "kl": 0.0128021240234375, + "learning_rate": 8.066995447347146e-06, + "loss": 0.0005, + "num_tokens": 1757364290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6061278484253648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000463092942688152, + "kl": 0.0139007568359375, + "learning_rate": 8.061150274000954e-06, + "loss": 0.0006, + "num_tokens": 1757927506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6062985405820602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005848746941996061, + "kl": 0.0135345458984375, + "learning_rate": 8.055305788876423e-06, + "loss": 0.0005, + "num_tokens": 1758495666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6064692327387556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015528892951816795, + "kl": 0.013580322265625, + "learning_rate": 8.049461994048143e-06, + "loss": 0.0005, + "num_tokens": 1759056242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.606639924895451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022444771764745507, + "kl": 0.01275634765625, + "learning_rate": 8.04361889159044e-06, + "loss": 0.0005, + "num_tokens": 1759621986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6068106170521465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008788999229324496, + "kl": 0.013336181640625, + "learning_rate": 8.03777648357741e-06, + "loss": 0.0005, + "num_tokens": 1760187506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6069813092088419, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043848536294960317, + "kl": 0.012939453125, + "learning_rate": 8.031934772082896e-06, + "loss": 0.0005, + "num_tokens": 1760752642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6071520013655373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004080325046778279, + "kl": 0.0140533447265625, + "learning_rate": 8.0260937591805e-06, + "loss": 0.0006, + "num_tokens": 1761317186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6073226935222327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013004356049010053, + "kl": 0.0136260986328125, + "learning_rate": 8.020253446943558e-06, + "loss": 0.0005, + "num_tokens": 1761881314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6074933856789281, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000909400461917625, + "kl": 0.0137481689453125, + "learning_rate": 8.014413837445181e-06, + "loss": 0.0006, + "num_tokens": 1762444850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6076640778356235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004215467781169171, + "kl": 0.014007568359375, + "learning_rate": 8.008574932758215e-06, + "loss": 0.0006, + "num_tokens": 1763017842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6078347699923189, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004887145524880952, + "kl": 0.0142059326171875, + "learning_rate": 8.002736734955264e-06, + "loss": 0.0006, + "num_tokens": 1763589618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6080054621490143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013269916240643742, + "kl": 0.0137786865234375, + "learning_rate": 7.996899246108674e-06, + "loss": 0.0006, + "num_tokens": 1764150546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6081761543057097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017208178741172852, + "kl": 0.0139923095703125, + "learning_rate": 7.991062468290544e-06, + "loss": 0.0006, + "num_tokens": 1764713938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.608346846462405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019663273069219793, + "kl": 0.0131378173828125, + "learning_rate": 7.985226403572717e-06, + "loss": 0.0005, + "num_tokens": 1765275330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6085175386191004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001005695237653641, + "kl": 0.0140228271484375, + "learning_rate": 7.979391054026791e-06, + "loss": 0.0006, + "num_tokens": 1765842146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6086882307757958, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035485150887397924, + "kl": 0.0129852294921875, + "learning_rate": 7.973556421724098e-06, + "loss": 0.0005, + "num_tokens": 1766407618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6088589229324912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037550808363854193, + "kl": 0.0134429931640625, + "learning_rate": 7.967722508735725e-06, + "loss": 0.0005, + "num_tokens": 1766977522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6090296150891866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010051417770067003, + "kl": 0.014129638671875, + "learning_rate": 7.961889317132502e-06, + "loss": 0.0006, + "num_tokens": 1767548834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.609200307245882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007451664546977427, + "kl": 0.0134429931640625, + "learning_rate": 7.956056848985e-06, + "loss": 0.0005, + "num_tokens": 1768116274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6093709994025774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002199896323113432, + "kl": 0.0139312744140625, + "learning_rate": 7.950225106363535e-06, + "loss": 0.0006, + "num_tokens": 1768680706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6095416915592728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002351993989824482, + "kl": 0.013641357421875, + "learning_rate": 7.944394091338163e-06, + "loss": 0.0005, + "num_tokens": 1769245186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6097123837159683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016734265419874113, + "kl": 0.0138702392578125, + "learning_rate": 7.938563805978684e-06, + "loss": 0.0006, + "num_tokens": 1769813586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6098830758726637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007922101438500496, + "kl": 0.0133819580078125, + "learning_rate": 7.932734252354646e-06, + "loss": 0.0005, + "num_tokens": 1770379362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6100537680293591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016636535053982198, + "kl": 0.0132904052734375, + "learning_rate": 7.92690543253532e-06, + "loss": 0.0005, + "num_tokens": 1770948290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6102244601860545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010866799989676216, + "kl": 0.0139923095703125, + "learning_rate": 7.92107734858973e-06, + "loss": 0.0006, + "num_tokens": 1771518594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6103951523427499, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010472733415378113, + "kl": 0.013671875, + "learning_rate": 7.915250002586639e-06, + "loss": 0.0005, + "num_tokens": 1772080994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6105658444994453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003011871674620624, + "kl": 0.0137481689453125, + "learning_rate": 7.909423396594542e-06, + "loss": 0.0005, + "num_tokens": 1772649426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6107365366561407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015200835222524186, + "kl": 0.0130615234375, + "learning_rate": 7.903597532681672e-06, + "loss": 0.0005, + "num_tokens": 1773211378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6109072288128361, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006940610844998952, + "kl": 0.01300048828125, + "learning_rate": 7.897772412916003e-06, + "loss": 0.0005, + "num_tokens": 1773784130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6110779209695314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007172193264002301, + "kl": 0.0134429931640625, + "learning_rate": 7.89194803936524e-06, + "loss": 0.0005, + "num_tokens": 1774349618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6112486131262268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001709285491324516, + "kl": 0.0135040283203125, + "learning_rate": 7.886124414096832e-06, + "loss": 0.0005, + "num_tokens": 1774917170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6114193052829222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002917214153690628, + "kl": 0.012847900390625, + "learning_rate": 7.880301539177944e-06, + "loss": 0.0005, + "num_tokens": 1775479890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6115899974396176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004158918875326161, + "kl": 0.0133056640625, + "learning_rate": 7.874479416675495e-06, + "loss": 0.0005, + "num_tokens": 1776042050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.611760689596313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003517402966157796, + "kl": 0.0137939453125, + "learning_rate": 7.868658048656125e-06, + "loss": 0.0006, + "num_tokens": 1776602834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6119313817530084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000679024291651601, + "kl": 0.013671875, + "learning_rate": 7.862837437186216e-06, + "loss": 0.0005, + "num_tokens": 1777167554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6121020739097038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015821863330581008, + "kl": 0.0130767822265625, + "learning_rate": 7.857017584331865e-06, + "loss": 0.0005, + "num_tokens": 1777737362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6122727660663992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039128770565779954, + "kl": 0.0138092041015625, + "learning_rate": 7.851198492158914e-06, + "loss": 0.0006, + "num_tokens": 1778305858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6124434582230946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003949515523399723, + "kl": 0.0139617919921875, + "learning_rate": 7.84538016273293e-06, + "loss": 0.0006, + "num_tokens": 1778872194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6126141503797901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001300370024184365, + "kl": 0.01324462890625, + "learning_rate": 7.839562598119217e-06, + "loss": 0.0005, + "num_tokens": 1779444066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6127848425364855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005759834666754146, + "kl": 0.0141448974609375, + "learning_rate": 7.83374580038279e-06, + "loss": 0.0006, + "num_tokens": 1780014994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6129555346931809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014289312411744606, + "kl": 0.0129852294921875, + "learning_rate": 7.827929771588408e-06, + "loss": 0.0005, + "num_tokens": 1780581986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6131262268498763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010837713271224523, + "kl": 0.013092041015625, + "learning_rate": 7.822114513800546e-06, + "loss": 0.0005, + "num_tokens": 1781151330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6132969190065717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004205196938385835, + "kl": 0.0130767822265625, + "learning_rate": 7.81630002908342e-06, + "loss": 0.0005, + "num_tokens": 1781718594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6134676111632671, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018601843015239922, + "kl": 0.0126495361328125, + "learning_rate": 7.810486319500954e-06, + "loss": 0.0005, + "num_tokens": 1782282754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6136383033199625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007647146032608133, + "kl": 0.0126495361328125, + "learning_rate": 7.804673387116808e-06, + "loss": 0.0005, + "num_tokens": 1782859154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6138089954766578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001930389687442642, + "kl": 0.01336669921875, + "learning_rate": 7.79886123399436e-06, + "loss": 0.0005, + "num_tokens": 1783427794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6139796876333532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008007604164700574, + "kl": 0.01348876953125, + "learning_rate": 7.793049862196721e-06, + "loss": 0.0005, + "num_tokens": 1783990082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6141503797900486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021360219295918828, + "kl": 0.0132904052734375, + "learning_rate": 7.787239273786716e-06, + "loss": 0.0005, + "num_tokens": 1784558754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.614321071946744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010325394723903073, + "kl": 0.0138397216796875, + "learning_rate": 7.781429470826888e-06, + "loss": 0.0006, + "num_tokens": 1785128322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6144917641034394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019005498493604504, + "kl": 0.013214111328125, + "learning_rate": 7.775620455379514e-06, + "loss": 0.0005, + "num_tokens": 1785693874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6146624562601348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005713981239107672, + "kl": 0.0129852294921875, + "learning_rate": 7.769812229506587e-06, + "loss": 0.0005, + "num_tokens": 1786255298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6148331484168302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000397166255390212, + "kl": 0.01318359375, + "learning_rate": 7.764004795269808e-06, + "loss": 0.0005, + "num_tokens": 1786817330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6150038405735256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008037069322245543, + "kl": 0.013824462890625, + "learning_rate": 7.758198154730611e-06, + "loss": 0.0006, + "num_tokens": 1787383810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.615174532730221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008058902942822279, + "kl": 0.013214111328125, + "learning_rate": 7.752392309950147e-06, + "loss": 0.0005, + "num_tokens": 1787945298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6153452248869165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007953066516564254, + "kl": 0.0126953125, + "learning_rate": 7.746587262989282e-06, + "loss": 0.0005, + "num_tokens": 1788507986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6155159170436119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002764439536317462, + "kl": 0.01336669921875, + "learning_rate": 7.740783015908592e-06, + "loss": 0.0005, + "num_tokens": 1789070050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6156866092003073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004533800537031296, + "kl": 0.0135955810546875, + "learning_rate": 7.734979570768376e-06, + "loss": 0.0005, + "num_tokens": 1789638130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6158573013570027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010525030534215907, + "kl": 0.013214111328125, + "learning_rate": 7.729176929628653e-06, + "loss": 0.0005, + "num_tokens": 1790206946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6160279935136981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009093791403937895, + "kl": 0.01312255859375, + "learning_rate": 7.723375094549151e-06, + "loss": 0.0005, + "num_tokens": 1790770594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6161986856703935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007313299988338585, + "kl": 0.01300048828125, + "learning_rate": 7.717574067589302e-06, + "loss": 0.0005, + "num_tokens": 1791334562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6163693778270889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009693184291757353, + "kl": 0.0129547119140625, + "learning_rate": 7.711773850808272e-06, + "loss": 0.0005, + "num_tokens": 1791902738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6165400699837843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005965917215938268, + "kl": 0.0136871337890625, + "learning_rate": 7.705974446264925e-06, + "loss": 0.0005, + "num_tokens": 1792470274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6167107621404796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013695938455579365, + "kl": 0.0131988525390625, + "learning_rate": 7.70017585601784e-06, + "loss": 0.0005, + "num_tokens": 1793055522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.616881454297175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006435941758529442, + "kl": 0.0132904052734375, + "learning_rate": 7.694378082125304e-06, + "loss": 0.0005, + "num_tokens": 1793619794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6170521464538704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018515222283312335, + "kl": 0.0126495361328125, + "learning_rate": 7.688581126645325e-06, + "loss": 0.0005, + "num_tokens": 1794188418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6172228386105658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009120818146089347, + "kl": 0.01275634765625, + "learning_rate": 7.682784991635603e-06, + "loss": 0.0005, + "num_tokens": 1794756818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6173935307672612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008230292437151922, + "kl": 0.0128021240234375, + "learning_rate": 7.676989679153571e-06, + "loss": 0.0005, + "num_tokens": 1795321826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6175642229239566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006593828751242101, + "kl": 0.012664794921875, + "learning_rate": 7.67119519125634e-06, + "loss": 0.0005, + "num_tokens": 1795883394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.617734915080652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004923051457825414, + "kl": 0.012786865234375, + "learning_rate": 7.665401530000754e-06, + "loss": 0.0005, + "num_tokens": 1796453042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6179056072373474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010559049534676936, + "kl": 0.01385498046875, + "learning_rate": 7.659608697443348e-06, + "loss": 0.0006, + "num_tokens": 1797015442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6180762993940428, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.739836367680919e-05, + "kl": 0.0130157470703125, + "learning_rate": 7.653816695640375e-06, + "loss": 0.0005, + "num_tokens": 1797580162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6182469915507383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001196515842974031, + "kl": 0.013580322265625, + "learning_rate": 7.64802552664778e-06, + "loss": 0.0005, + "num_tokens": 1798146610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6184176837074337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029151926437161977, + "kl": 0.0129547119140625, + "learning_rate": 7.642235192521224e-06, + "loss": 0.0005, + "num_tokens": 1798708226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6185883758641291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003707457499531167, + "kl": 0.0124664306640625, + "learning_rate": 7.63644569531606e-06, + "loss": 0.0005, + "num_tokens": 1799272114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6187590680208245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008300195238961939, + "kl": 0.013641357421875, + "learning_rate": 7.630657037087358e-06, + "loss": 0.0005, + "num_tokens": 1799835026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6189297601775199, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039220363080865356, + "kl": 0.01300048828125, + "learning_rate": 7.62486921988988e-06, + "loss": 0.0005, + "num_tokens": 1800399010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6191004523342153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004032226141400451, + "kl": 0.0130157470703125, + "learning_rate": 7.619082245778089e-06, + "loss": 0.0005, + "num_tokens": 1800964674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6192711444909107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023423736634413895, + "kl": 0.01318359375, + "learning_rate": 7.6132961168061555e-06, + "loss": 0.0005, + "num_tokens": 1801529730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.619441836647606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007691296185744967, + "kl": 0.0128173828125, + "learning_rate": 7.607510835027948e-06, + "loss": 0.0005, + "num_tokens": 1802095874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6196125288043014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024342616485129465, + "kl": 0.013580322265625, + "learning_rate": 7.601726402497028e-06, + "loss": 0.0005, + "num_tokens": 1802662962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6197832209609968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004283003130837903, + "kl": 0.01263427734375, + "learning_rate": 7.595942821266661e-06, + "loss": 0.0005, + "num_tokens": 1803226514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6199539131176922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005624275983916239, + "kl": 0.0122528076171875, + "learning_rate": 7.590160093389812e-06, + "loss": 0.0005, + "num_tokens": 1803791570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6201246052743876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006298476675515752, + "kl": 0.013763427734375, + "learning_rate": 7.584378220919143e-06, + "loss": 0.0005, + "num_tokens": 1804371042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.620295297431083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021008573053830757, + "kl": 0.0136566162109375, + "learning_rate": 7.578597205907003e-06, + "loss": 0.0005, + "num_tokens": 1804940338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6204659895877784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007161032982935776, + "kl": 0.0123291015625, + "learning_rate": 7.572817050405446e-06, + "loss": 0.0005, + "num_tokens": 1805510034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6206366817444738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000808823063135577, + "kl": 0.0132904052734375, + "learning_rate": 7.567037756466222e-06, + "loss": 0.0005, + "num_tokens": 1806078866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6208073739011692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001022153818219509, + "kl": 0.0134735107421875, + "learning_rate": 7.561259326140773e-06, + "loss": 0.0005, + "num_tokens": 1806647234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6209780660578647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006061922779138473, + "kl": 0.0127410888671875, + "learning_rate": 7.555481761480224e-06, + "loss": 0.0005, + "num_tokens": 1807214834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6211487582145601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018673914870213763, + "kl": 0.0131988525390625, + "learning_rate": 7.54970506453541e-06, + "loss": 0.0005, + "num_tokens": 1807777346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6213194503712555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009705367555255281, + "kl": 0.012847900390625, + "learning_rate": 7.5439292373568465e-06, + "loss": 0.0005, + "num_tokens": 1808345266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6214901425279509, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015820605830746122, + "kl": 0.013214111328125, + "learning_rate": 7.5381542819947485e-06, + "loss": 0.0005, + "num_tokens": 1808917538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6216608346846463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008036021982882425, + "kl": 0.0134429931640625, + "learning_rate": 7.532380200499008e-06, + "loss": 0.0005, + "num_tokens": 1809485330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6218315268413417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004760978406925711, + "kl": 0.0127105712890625, + "learning_rate": 7.526606994919222e-06, + "loss": 0.0005, + "num_tokens": 1810054082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6220022189980371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030018556797345544, + "kl": 0.0136871337890625, + "learning_rate": 7.520834667304668e-06, + "loss": 0.0005, + "num_tokens": 1810619970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6221729111547324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008635277889086489, + "kl": 0.0130767822265625, + "learning_rate": 7.515063219704318e-06, + "loss": 0.0005, + "num_tokens": 1811185138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6223436033114278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028782113247892305, + "kl": 0.0135498046875, + "learning_rate": 7.509292654166819e-06, + "loss": 0.0005, + "num_tokens": 1811753650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6225142954681232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001450124504268395, + "kl": 0.01275634765625, + "learning_rate": 7.50352297274052e-06, + "loss": 0.0005, + "num_tokens": 1812319682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6226849876248186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006579220172102004, + "kl": 0.0127105712890625, + "learning_rate": 7.497754177473446e-06, + "loss": 0.0005, + "num_tokens": 1812882610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.622855679781514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005734786314267629, + "kl": 0.0134429931640625, + "learning_rate": 7.491986270413321e-06, + "loss": 0.0005, + "num_tokens": 1813446546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6230263719382094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009393123611210741, + "kl": 0.0130462646484375, + "learning_rate": 7.4862192536075275e-06, + "loss": 0.0005, + "num_tokens": 1814007666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6231970640949048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006688358112006214, + "kl": 0.0132598876953125, + "learning_rate": 7.4804531291031605e-06, + "loss": 0.0005, + "num_tokens": 1814574658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6233677562516002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036163385955724676, + "kl": 0.01312255859375, + "learning_rate": 7.474687898946979e-06, + "loss": 0.0005, + "num_tokens": 1815138322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6235384484082956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007360004171954123, + "kl": 0.0129852294921875, + "learning_rate": 7.46892356518544e-06, + "loss": 0.0005, + "num_tokens": 1815699842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.623709140564991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004506531620129003, + "kl": 0.0129852294921875, + "learning_rate": 7.463160129864666e-06, + "loss": 0.0005, + "num_tokens": 1816271042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6238798327216865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000572984684800487, + "kl": 0.012298583984375, + "learning_rate": 7.457397595030471e-06, + "loss": 0.0005, + "num_tokens": 1816836290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6240505248783819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020250533169262645, + "kl": 0.013214111328125, + "learning_rate": 7.451635962728346e-06, + "loss": 0.0005, + "num_tokens": 1817406834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6242212170350773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011303671719341594, + "kl": 0.01300048828125, + "learning_rate": 7.445875235003465e-06, + "loss": 0.0005, + "num_tokens": 1817968178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6243919091917727, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011819802765204105, + "kl": 0.0123443603515625, + "learning_rate": 7.440115413900678e-06, + "loss": 0.0005, + "num_tokens": 1818536658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6245626013484681, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023590374706334362, + "kl": 0.0137786865234375, + "learning_rate": 7.43435650146451e-06, + "loss": 0.0006, + "num_tokens": 1819105490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6247332935051635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019055501321997968, + "kl": 0.01263427734375, + "learning_rate": 7.42859849973917e-06, + "loss": 0.0005, + "num_tokens": 1819667602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6249039856618588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003373167945123608, + "kl": 0.01373291015625, + "learning_rate": 7.422841410768544e-06, + "loss": 0.0005, + "num_tokens": 1820239266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6250746778185542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005859734166233874, + "kl": 0.0126800537109375, + "learning_rate": 7.417085236596184e-06, + "loss": 0.0005, + "num_tokens": 1820804946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6252453699752496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016663389255803694, + "kl": 0.01324462890625, + "learning_rate": 7.4113299792653255e-06, + "loss": 0.0005, + "num_tokens": 1821373906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.625416062131945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005017082368762287, + "kl": 0.0137939453125, + "learning_rate": 7.405575640818881e-06, + "loss": 0.0006, + "num_tokens": 1821942130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6255867542886404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024149412127032107, + "kl": 0.013092041015625, + "learning_rate": 7.3998222232994335e-06, + "loss": 0.0005, + "num_tokens": 1822509250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6257574464453358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021754184122199253, + "kl": 0.013916015625, + "learning_rate": 7.394069728749232e-06, + "loss": 0.0006, + "num_tokens": 1823075890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6259281386020312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006602706930350168, + "kl": 0.013092041015625, + "learning_rate": 7.388318159210207e-06, + "loss": 0.0005, + "num_tokens": 1823640994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6260988307587266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019358055135184374, + "kl": 0.01300048828125, + "learning_rate": 7.382567516723962e-06, + "loss": 0.0005, + "num_tokens": 1824207906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.626269522915422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008309919914322664, + "kl": 0.013336181640625, + "learning_rate": 7.376817803331765e-06, + "loss": 0.0005, + "num_tokens": 1824772258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6264402150721174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006942106309166671, + "kl": 0.0127410888671875, + "learning_rate": 7.371069021074553e-06, + "loss": 0.0005, + "num_tokens": 1825339538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6266109072288129, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005492371011786177, + "kl": 0.0129547119140625, + "learning_rate": 7.3653211719929406e-06, + "loss": 0.0005, + "num_tokens": 1825914786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6267815993855083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01426550794978791, + "kl": 0.0146484375, + "learning_rate": 7.359574258127208e-06, + "loss": 0.0006, + "num_tokens": 1826479378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6269522915422037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006134412408227319, + "kl": 0.012847900390625, + "learning_rate": 7.3538282815173e-06, + "loss": 0.0005, + "num_tokens": 1827042994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6271229836988991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023593400774390757, + "kl": 0.01312255859375, + "learning_rate": 7.348083244202829e-06, + "loss": 0.0005, + "num_tokens": 1827606194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6272936758555945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037158555599596814, + "kl": 0.013641357421875, + "learning_rate": 7.342339148223076e-06, + "loss": 0.0005, + "num_tokens": 1828165906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6274643680122899, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020088700143494234, + "kl": 0.013427734375, + "learning_rate": 7.33659599561699e-06, + "loss": 0.0005, + "num_tokens": 1828735330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6276350601689852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000736333776916882, + "kl": 0.0138702392578125, + "learning_rate": 7.330853788423187e-06, + "loss": 0.0006, + "num_tokens": 1829300514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6278057523256806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009176744202008916, + "kl": 0.013671875, + "learning_rate": 7.325112528679933e-06, + "loss": 0.0005, + "num_tokens": 1829866130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.627976444482376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006969793951503003, + "kl": 0.0131683349609375, + "learning_rate": 7.319372218425176e-06, + "loss": 0.0005, + "num_tokens": 1830425634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6281471366390714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009491076674162733, + "kl": 0.0139923095703125, + "learning_rate": 7.313632859696513e-06, + "loss": 0.0006, + "num_tokens": 1830989810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6283178287957668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039652772044870263, + "kl": 0.0141448974609375, + "learning_rate": 7.307894454531217e-06, + "loss": 0.0006, + "num_tokens": 1831560034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6284885209524622, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003500719100298624, + "kl": 0.013641357421875, + "learning_rate": 7.302157004966208e-06, + "loss": 0.0005, + "num_tokens": 1832125122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6286592131091576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001032069837936386, + "kl": 0.01348876953125, + "learning_rate": 7.296420513038077e-06, + "loss": 0.0005, + "num_tokens": 1832694434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.628829905265853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014448525500221558, + "kl": 0.01312255859375, + "learning_rate": 7.290684980783067e-06, + "loss": 0.0005, + "num_tokens": 1833256178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6290005974225484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006262428331143754, + "kl": 0.0133514404296875, + "learning_rate": 7.284950410237093e-06, + "loss": 0.0005, + "num_tokens": 1833815922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6291712895792438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016413988225617117, + "kl": 0.013641357421875, + "learning_rate": 7.2792168034357135e-06, + "loss": 0.0005, + "num_tokens": 1834378114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6293419817359392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002884769409998335, + "kl": 0.0143890380859375, + "learning_rate": 7.2734841624141515e-06, + "loss": 0.0006, + "num_tokens": 1834940322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6295126738926347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002207376651550373, + "kl": 0.013641357421875, + "learning_rate": 7.2677524892072936e-06, + "loss": 0.0005, + "num_tokens": 1835508306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6296833660493301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019207833891698638, + "kl": 0.0134735107421875, + "learning_rate": 7.262021785849673e-06, + "loss": 0.0005, + "num_tokens": 1836071650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6298540582060255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012078350736327863, + "kl": 0.0135650634765625, + "learning_rate": 7.256292054375484e-06, + "loss": 0.0005, + "num_tokens": 1836634434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6300247503627209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005285356218168411, + "kl": 0.01422119140625, + "learning_rate": 7.2505632968185705e-06, + "loss": 0.0006, + "num_tokens": 1837200002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6301954425194163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017271701628948312, + "kl": 0.0137176513671875, + "learning_rate": 7.244835515212438e-06, + "loss": 0.0005, + "num_tokens": 1837769602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6303661346761116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014165632958872457, + "kl": 0.0141448974609375, + "learning_rate": 7.239108711590246e-06, + "loss": 0.0006, + "num_tokens": 1838334274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.630536826832807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006760348404794732, + "kl": 0.01312255859375, + "learning_rate": 7.2333828879847964e-06, + "loss": 0.0005, + "num_tokens": 1838895698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6307075189895024, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018131982115718872, + "kl": 0.0136566162109375, + "learning_rate": 7.22765804642855e-06, + "loss": 0.0005, + "num_tokens": 1839459010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6308782111461978, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021007673461950534, + "kl": 0.0132293701171875, + "learning_rate": 7.221934188953623e-06, + "loss": 0.0005, + "num_tokens": 1840023346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6310489033028932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000343658113815508, + "kl": 0.0136566162109375, + "learning_rate": 7.21621131759178e-06, + "loss": 0.0005, + "num_tokens": 1840588834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6312195954595886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004675004751370012, + "kl": 0.01416015625, + "learning_rate": 7.210489434374428e-06, + "loss": 0.0006, + "num_tokens": 1841156658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.631390287616284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009887904397074868, + "kl": 0.01397705078125, + "learning_rate": 7.2047685413326275e-06, + "loss": 0.0006, + "num_tokens": 1841730306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6315609797729794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001649402798875904, + "kl": 0.013092041015625, + "learning_rate": 7.199048640497097e-06, + "loss": 0.0005, + "num_tokens": 1842307090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6317316719296748, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002960260824437389, + "kl": 0.0128936767578125, + "learning_rate": 7.193329733898191e-06, + "loss": 0.0005, + "num_tokens": 1842884930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6319023640863702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004266089595446086, + "kl": 0.0134735107421875, + "learning_rate": 7.187611823565911e-06, + "loss": 0.0005, + "num_tokens": 1843448018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6320730562430656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007710700259103903, + "kl": 0.0131683349609375, + "learning_rate": 7.1818949115299145e-06, + "loss": 0.0005, + "num_tokens": 1844012322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.632243748399761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003770698281983299, + "kl": 0.012969970703125, + "learning_rate": 7.1761789998194965e-06, + "loss": 0.0005, + "num_tokens": 1844577986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6324144405564565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001203182707056795, + "kl": 0.012359619140625, + "learning_rate": 7.1704640904636e-06, + "loss": 0.0005, + "num_tokens": 1845158114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6325851327131519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010513276464573535, + "kl": 0.0130615234375, + "learning_rate": 7.1647501854908095e-06, + "loss": 0.0005, + "num_tokens": 1845722210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6327558248698473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004212944384072147, + "kl": 0.0128631591796875, + "learning_rate": 7.1590372869293564e-06, + "loss": 0.0005, + "num_tokens": 1846286002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6329265170265427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016437751856941477, + "kl": 0.0131072998046875, + "learning_rate": 7.153325396807112e-06, + "loss": 0.0005, + "num_tokens": 1846850802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6330972091832381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016706093393219207, + "kl": 0.0133819580078125, + "learning_rate": 7.1476145171515994e-06, + "loss": 0.0005, + "num_tokens": 1847415602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6332679013399334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009495375770739755, + "kl": 0.01336669921875, + "learning_rate": 7.1419046499899626e-06, + "loss": 0.0005, + "num_tokens": 1847982226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6334385934966288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016073867723799277, + "kl": 0.01287841796875, + "learning_rate": 7.136195797349005e-06, + "loss": 0.0005, + "num_tokens": 1848543922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6336092856533242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015468745976729295, + "kl": 0.013671875, + "learning_rate": 7.130487961255159e-06, + "loss": 0.0005, + "num_tokens": 1849111074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6337799778100196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007419053562720521, + "kl": 0.0129547119140625, + "learning_rate": 7.1247811437345095e-06, + "loss": 0.0005, + "num_tokens": 1849680770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.633950669966715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001263162044135874, + "kl": 0.01397705078125, + "learning_rate": 7.119075346812761e-06, + "loss": 0.0006, + "num_tokens": 1850246306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6341213621234104, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1014293087319111, + "kl": 0.1480712890625, + "learning_rate": 7.113370572515269e-06, + "loss": 0.0059, + "num_tokens": 1850836354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6342920542801058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045398105822168784, + "kl": 0.012542724609375, + "learning_rate": 7.107666822867021e-06, + "loss": 0.0005, + "num_tokens": 1851401250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6344627464368012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018346476305356344, + "kl": 0.0131378173828125, + "learning_rate": 7.101964099892647e-06, + "loss": 0.0005, + "num_tokens": 1851966754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6346334385934966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030503345890651616, + "kl": 0.0131072998046875, + "learning_rate": 7.096262405616403e-06, + "loss": 0.0005, + "num_tokens": 1852530818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.634804130750192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003979283619859762, + "kl": 0.0131683349609375, + "learning_rate": 7.090561742062183e-06, + "loss": 0.0005, + "num_tokens": 1853099970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6349748229068874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033982921537168694, + "kl": 0.0132904052734375, + "learning_rate": 7.084862111253523e-06, + "loss": 0.0005, + "num_tokens": 1853666402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6351455150635829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005301177269817785, + "kl": 0.013397216796875, + "learning_rate": 7.079163515213585e-06, + "loss": 0.0005, + "num_tokens": 1854239778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6353162072202783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019342559537412964, + "kl": 0.0133514404296875, + "learning_rate": 7.0734659559651606e-06, + "loss": 0.0005, + "num_tokens": 1854808466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6354868993769737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005252994503097877, + "kl": 0.0128631591796875, + "learning_rate": 7.067769435530678e-06, + "loss": 0.0005, + "num_tokens": 1855374770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6356575915336691, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016210124101482862, + "kl": 0.0134429931640625, + "learning_rate": 7.062073955932201e-06, + "loss": 0.0005, + "num_tokens": 1855937794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6358282836903645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007938559018365016, + "kl": 0.013427734375, + "learning_rate": 7.056379519191418e-06, + "loss": 0.0005, + "num_tokens": 1856506258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6359989758470598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000255414543404338, + "kl": 0.0138702392578125, + "learning_rate": 7.050686127329644e-06, + "loss": 0.0006, + "num_tokens": 1857068994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6361696680037552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039890848336169416, + "kl": 0.013580322265625, + "learning_rate": 7.044993782367831e-06, + "loss": 0.0005, + "num_tokens": 1857636114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6363403601604506, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002155171679535459, + "kl": 0.013916015625, + "learning_rate": 7.039302486326556e-06, + "loss": 0.0006, + "num_tokens": 1858200514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.636511052317146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014824454401836256, + "kl": 0.012786865234375, + "learning_rate": 7.033612241226026e-06, + "loss": 0.0005, + "num_tokens": 1858778818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6366817444738414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002188158347406144, + "kl": 0.0125579833984375, + "learning_rate": 7.027923049086066e-06, + "loss": 0.0005, + "num_tokens": 1859346098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6368524366305368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012479448969280392, + "kl": 0.0136260986328125, + "learning_rate": 7.02223491192614e-06, + "loss": 0.0005, + "num_tokens": 1859911266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6370231287872322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014874157236593334, + "kl": 0.01348876953125, + "learning_rate": 7.016547831765328e-06, + "loss": 0.0005, + "num_tokens": 1860475010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6371938209439276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013666501219255852, + "kl": 0.0130767822265625, + "learning_rate": 7.0108618106223425e-06, + "loss": 0.0005, + "num_tokens": 1861040162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.637364513100623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006022513161254601, + "kl": 0.0138702392578125, + "learning_rate": 7.005176850515507e-06, + "loss": 0.0006, + "num_tokens": 1861605234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6375352052573184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010693139058252546, + "kl": 0.013214111328125, + "learning_rate": 6.999492953462786e-06, + "loss": 0.0005, + "num_tokens": 1862167202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6377058974140138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014260366033955972, + "kl": 0.0129547119140625, + "learning_rate": 6.993810121481752e-06, + "loss": 0.0005, + "num_tokens": 1862733954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6378765895707093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018720235213881512, + "kl": 0.012451171875, + "learning_rate": 6.988128356589614e-06, + "loss": 0.0005, + "num_tokens": 1863304738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6380472817274047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006452721328784806, + "kl": 0.0134735107421875, + "learning_rate": 6.98244766080318e-06, + "loss": 0.0005, + "num_tokens": 1863866226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6382179738841001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013777367498590525, + "kl": 0.0133819580078125, + "learning_rate": 6.976768036138902e-06, + "loss": 0.0005, + "num_tokens": 1864439890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6383886660407955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007850227966789342, + "kl": 0.0132293701171875, + "learning_rate": 6.971089484612836e-06, + "loss": 0.0005, + "num_tokens": 1865015970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6385593581974909, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001132128712006634, + "kl": 0.012542724609375, + "learning_rate": 6.965412008240673e-06, + "loss": 0.0005, + "num_tokens": 1865583842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6387300503541862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005466899621427983, + "kl": 0.013092041015625, + "learning_rate": 6.959735609037699e-06, + "loss": 0.0005, + "num_tokens": 1866151138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6389007425108816, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010837899763109712, + "kl": 0.0130615234375, + "learning_rate": 6.954060289018836e-06, + "loss": 0.0005, + "num_tokens": 1866712930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.639071434667577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001273024820774048, + "kl": 0.0138397216796875, + "learning_rate": 6.9483860501986185e-06, + "loss": 0.0006, + "num_tokens": 1867279970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6392421268242724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024513001269507815, + "kl": 0.0124969482421875, + "learning_rate": 6.942712894591199e-06, + "loss": 0.0005, + "num_tokens": 1867841426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6394128189809678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041525902021294527, + "kl": 0.013153076171875, + "learning_rate": 6.937040824210339e-06, + "loss": 0.0005, + "num_tokens": 1868404354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6395835111376632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031364983724576094, + "kl": 0.01287841796875, + "learning_rate": 6.93136984106942e-06, + "loss": 0.0005, + "num_tokens": 1868993026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6397542032943586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003256537912878783, + "kl": 0.0127716064453125, + "learning_rate": 6.925699947181433e-06, + "loss": 0.0005, + "num_tokens": 1869558018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.639924895451054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003736179939836461, + "kl": 0.0133514404296875, + "learning_rate": 6.9200311445589944e-06, + "loss": 0.0005, + "num_tokens": 1870130898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6400955876077494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001071954945707776, + "kl": 0.0127716064453125, + "learning_rate": 6.914363435214316e-06, + "loss": 0.0005, + "num_tokens": 1870698226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6402662797644448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023978280144912944, + "kl": 0.0132904052734375, + "learning_rate": 6.908696821159231e-06, + "loss": 0.0005, + "num_tokens": 1871262290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6404369719211402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003968675272888688, + "kl": 0.0132904052734375, + "learning_rate": 6.9030313044051925e-06, + "loss": 0.0005, + "num_tokens": 1871822962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6406076640778356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020159665284160343, + "kl": 0.0126953125, + "learning_rate": 6.89736688696324e-06, + "loss": 0.0005, + "num_tokens": 1872388354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.640778356234531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006891628750328975, + "kl": 0.0132598876953125, + "learning_rate": 6.891703570844044e-06, + "loss": 0.0005, + "num_tokens": 1872953602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6409490483912265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000836199697828935, + "kl": 0.0132904052734375, + "learning_rate": 6.886041358057877e-06, + "loss": 0.0005, + "num_tokens": 1873518866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6411197405479219, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016477833911275356, + "kl": 0.0132293701171875, + "learning_rate": 6.880380250614624e-06, + "loss": 0.0005, + "num_tokens": 1874084370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6412904327046173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025589428667550956, + "kl": 0.013458251953125, + "learning_rate": 6.874720250523769e-06, + "loss": 0.0005, + "num_tokens": 1874648210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6414611248613126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014574904461313832, + "kl": 0.0131988525390625, + "learning_rate": 6.869061359794408e-06, + "loss": 0.0005, + "num_tokens": 1875206578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.641631817018008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040952791822586424, + "kl": 0.012664794921875, + "learning_rate": 6.863403580435242e-06, + "loss": 0.0005, + "num_tokens": 1875772226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6418025091747034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006214358832579869, + "kl": 0.01263427734375, + "learning_rate": 6.857746914454585e-06, + "loss": 0.0005, + "num_tokens": 1876346210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6419732013313988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010286332478679971, + "kl": 0.01312255859375, + "learning_rate": 6.8520913638603425e-06, + "loss": 0.0005, + "num_tokens": 1876912290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6421438934880942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002692331482661092, + "kl": 0.0130615234375, + "learning_rate": 6.846436930660029e-06, + "loss": 0.0005, + "num_tokens": 1877477314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6423145856447896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018086191588675866, + "kl": 0.0126953125, + "learning_rate": 6.840783616860772e-06, + "loss": 0.0005, + "num_tokens": 1878046578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.642485277801485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007811895848628274, + "kl": 0.0125732421875, + "learning_rate": 6.835131424469292e-06, + "loss": 0.0005, + "num_tokens": 1878612546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6426559699581804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029335218882881015, + "kl": 0.0123291015625, + "learning_rate": 6.829480355491909e-06, + "loss": 0.0005, + "num_tokens": 1879181906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6428266621148758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013480398875345132, + "kl": 0.012664794921875, + "learning_rate": 6.823830411934547e-06, + "loss": 0.0005, + "num_tokens": 1879748146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6429973542715712, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002584531020240764, + "kl": 0.0128173828125, + "learning_rate": 6.818181595802739e-06, + "loss": 0.0005, + "num_tokens": 1880312258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6431680464282666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024876681718947846, + "kl": 0.01361083984375, + "learning_rate": 6.812533909101609e-06, + "loss": 0.0005, + "num_tokens": 1880880226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.643338738584962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003186417156922676, + "kl": 0.0133056640625, + "learning_rate": 6.8068873538358785e-06, + "loss": 0.0005, + "num_tokens": 1881450786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6435094307416575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010921596834195556, + "kl": 0.0125732421875, + "learning_rate": 6.80124193200987e-06, + "loss": 0.0005, + "num_tokens": 1882016146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6436801228983529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034792239648642515, + "kl": 0.013092041015625, + "learning_rate": 6.7955976456275096e-06, + "loss": 0.0005, + "num_tokens": 1882581938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6438508150550483, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001043908870729747, + "kl": 0.0125579833984375, + "learning_rate": 6.789954496692316e-06, + "loss": 0.0005, + "num_tokens": 1883146626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6440215072117437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008972748332329657, + "kl": 0.012969970703125, + "learning_rate": 6.784312487207394e-06, + "loss": 0.0005, + "num_tokens": 1883714146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.644192199368439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026566111490928623, + "kl": 0.0127105712890625, + "learning_rate": 6.778671619175463e-06, + "loss": 0.0005, + "num_tokens": 1884285026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6443628915251344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000589969854132526, + "kl": 0.012847900390625, + "learning_rate": 6.773031894598823e-06, + "loss": 0.0005, + "num_tokens": 1884850178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6445335836818298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020820084375280498, + "kl": 0.0130767822265625, + "learning_rate": 6.767393315479376e-06, + "loss": 0.0005, + "num_tokens": 1885413058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6447042758385252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009722726333060707, + "kl": 0.013031005859375, + "learning_rate": 6.761755883818608e-06, + "loss": 0.0005, + "num_tokens": 1885980770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6448749679952206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003991722810916687, + "kl": 0.013458251953125, + "learning_rate": 6.756119601617608e-06, + "loss": 0.0005, + "num_tokens": 1886544978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.645045660151916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002668813900198594, + "kl": 0.0135040283203125, + "learning_rate": 6.7504844708770516e-06, + "loss": 0.0005, + "num_tokens": 1887107522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6452163523086114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036652542954401838, + "kl": 0.01361083984375, + "learning_rate": 6.744850493597213e-06, + "loss": 0.0005, + "num_tokens": 1887669970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6453870444653068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003346546233059101, + "kl": 0.0123138427734375, + "learning_rate": 6.739217671777938e-06, + "loss": 0.0005, + "num_tokens": 1888237026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6455577366220022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000312053541270764, + "kl": 0.01263427734375, + "learning_rate": 6.733586007418684e-06, + "loss": 0.0005, + "num_tokens": 1888802002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6457284287786976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00069405621466515, + "kl": 0.0126953125, + "learning_rate": 6.727955502518485e-06, + "loss": 0.0005, + "num_tokens": 1889366466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.645899120935393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000567835113550551, + "kl": 0.01275634765625, + "learning_rate": 6.7223261590759735e-06, + "loss": 0.0005, + "num_tokens": 1889934594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6460698130920884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00090213868821965, + "kl": 0.0133056640625, + "learning_rate": 6.716697979089355e-06, + "loss": 0.0005, + "num_tokens": 1890506594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6462405052487838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005303667664624918, + "kl": 0.0129241943359375, + "learning_rate": 6.711070964556434e-06, + "loss": 0.0005, + "num_tokens": 1891072050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6464111974054793, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045022030247871057, + "kl": 0.012969970703125, + "learning_rate": 6.7054451174745945e-06, + "loss": 0.0005, + "num_tokens": 1891635586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6465818895621747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006040541509669988, + "kl": 0.0127716064453125, + "learning_rate": 6.699820439840817e-06, + "loss": 0.0005, + "num_tokens": 1892199538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6467525817188701, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005028370707540219, + "kl": 0.0126190185546875, + "learning_rate": 6.694196933651651e-06, + "loss": 0.0005, + "num_tokens": 1892767410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6469232738755654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027035202200603574, + "kl": 0.0128936767578125, + "learning_rate": 6.688574600903241e-06, + "loss": 0.0005, + "num_tokens": 1893334578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6470939660322608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000746977369258791, + "kl": 0.0130767822265625, + "learning_rate": 6.68295344359131e-06, + "loss": 0.0005, + "num_tokens": 1893911314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6472646581889562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012308380854021144, + "kl": 0.0126190185546875, + "learning_rate": 6.677333463711172e-06, + "loss": 0.0005, + "num_tokens": 1894483298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6474353503456516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006664528988976813, + "kl": 0.0132904052734375, + "learning_rate": 6.671714663257711e-06, + "loss": 0.0005, + "num_tokens": 1895047922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.647606042502347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048303208570594586, + "kl": 0.0132293701171875, + "learning_rate": 6.666097044225399e-06, + "loss": 0.0005, + "num_tokens": 1895615330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6477767346590424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014046675047992864, + "kl": 0.0135498046875, + "learning_rate": 6.660480608608291e-06, + "loss": 0.0005, + "num_tokens": 1896179426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6479474268157378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008827896655560147, + "kl": 0.013519287109375, + "learning_rate": 6.654865358400018e-06, + "loss": 0.0005, + "num_tokens": 1896744642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6481181189724332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006794599149984405, + "kl": 0.0128631591796875, + "learning_rate": 6.64925129559379e-06, + "loss": 0.0005, + "num_tokens": 1897310514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6482888111291286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007209791395888131, + "kl": 0.012786865234375, + "learning_rate": 6.643638422182393e-06, + "loss": 0.0005, + "num_tokens": 1897875538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.648459503285824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004560352997614785, + "kl": 0.0134735107421875, + "learning_rate": 6.638026740158202e-06, + "loss": 0.0005, + "num_tokens": 1898444450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6486301954425194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007955262174281838, + "kl": 0.0125885009765625, + "learning_rate": 6.6324162515131605e-06, + "loss": 0.0005, + "num_tokens": 1899010258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6488008875992148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005402352522601999, + "kl": 0.0130767822265625, + "learning_rate": 6.626806958238784e-06, + "loss": 0.0005, + "num_tokens": 1899575314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6489715797559102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018757863406651456, + "kl": 0.01397705078125, + "learning_rate": 6.62119886232617e-06, + "loss": 0.0006, + "num_tokens": 1900140322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6491422719126057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006116219512655072, + "kl": 0.0132598876953125, + "learning_rate": 6.615591965765994e-06, + "loss": 0.0005, + "num_tokens": 1900710690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6493129640693011, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006847559458023512, + "kl": 0.013275146484375, + "learning_rate": 6.609986270548503e-06, + "loss": 0.0005, + "num_tokens": 1901271938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6494836562259965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003386149327009736, + "kl": 0.0121307373046875, + "learning_rate": 6.604381778663509e-06, + "loss": 0.0005, + "num_tokens": 1901836914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6496543483826919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040299584072318894, + "kl": 0.013153076171875, + "learning_rate": 6.59877849210041e-06, + "loss": 0.0005, + "num_tokens": 1902405730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6498250405393872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016267352916696034, + "kl": 0.013336181640625, + "learning_rate": 6.59317641284817e-06, + "loss": 0.0005, + "num_tokens": 1902971666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6499957326960826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011048156888391369, + "kl": 0.0129852294921875, + "learning_rate": 6.5875755428953255e-06, + "loss": 0.0005, + "num_tokens": 1903539266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.650166424852778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004031524128558535, + "kl": 0.0127410888671875, + "learning_rate": 6.581975884229979e-06, + "loss": 0.0005, + "num_tokens": 1904108114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6503371170094734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005969245259068063, + "kl": 0.0131378173828125, + "learning_rate": 6.576377438839812e-06, + "loss": 0.0005, + "num_tokens": 1904671842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6505078091661688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007087405990583385, + "kl": 0.0123748779296875, + "learning_rate": 6.570780208712064e-06, + "loss": 0.0005, + "num_tokens": 1905234130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6506785013228642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004210659825454693, + "kl": 0.012725830078125, + "learning_rate": 6.565184195833562e-06, + "loss": 0.0005, + "num_tokens": 1905803842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6508491934795596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020099972749711362, + "kl": 0.012298583984375, + "learning_rate": 6.559589402190676e-06, + "loss": 0.0005, + "num_tokens": 1906370434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.651019885636255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004268749542549844, + "kl": 0.013153076171875, + "learning_rate": 6.553995829769362e-06, + "loss": 0.0005, + "num_tokens": 1906936354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6511905777929504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006750582793790338, + "kl": 0.0132598876953125, + "learning_rate": 6.5484034805551325e-06, + "loss": 0.0005, + "num_tokens": 1907499138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6513612699496458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027350247462410457, + "kl": 0.012786865234375, + "learning_rate": 6.542812356533075e-06, + "loss": 0.0005, + "num_tokens": 1908062578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6515319621063412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004796853456858803, + "kl": 0.0133209228515625, + "learning_rate": 6.537222459687832e-06, + "loss": 0.0005, + "num_tokens": 1908641090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6517026542630366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012288611430769443, + "kl": 0.012939453125, + "learning_rate": 6.531633792003618e-06, + "loss": 0.0005, + "num_tokens": 1909204242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.651873346419732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046129207688117973, + "kl": 0.0124359130859375, + "learning_rate": 6.526046355464203e-06, + "loss": 0.0005, + "num_tokens": 1909768786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6520440385764275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005460239933612889, + "kl": 0.0127716064453125, + "learning_rate": 6.5204601520529344e-06, + "loss": 0.0005, + "num_tokens": 1910337522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6522147307331229, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006664777592447339, + "kl": 0.0129547119140625, + "learning_rate": 6.514875183752704e-06, + "loss": 0.0005, + "num_tokens": 1910899970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6523854228898183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001906819247733083, + "kl": 0.01263427734375, + "learning_rate": 6.509291452545978e-06, + "loss": 0.0005, + "num_tokens": 1911467682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6525561150465136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039263889960988827, + "kl": 0.0130462646484375, + "learning_rate": 6.503708960414781e-06, + "loss": 0.0005, + "num_tokens": 1912032690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.652726807203209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038682432381315177, + "kl": 0.012847900390625, + "learning_rate": 6.4981277093406946e-06, + "loss": 0.0005, + "num_tokens": 1912594898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6528974993599044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004026658597428815, + "kl": 0.0131683349609375, + "learning_rate": 6.49254770130486e-06, + "loss": 0.0005, + "num_tokens": 1913160930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6530681915165998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000481699070139346, + "kl": 0.01300048828125, + "learning_rate": 6.486968938287977e-06, + "loss": 0.0005, + "num_tokens": 1913724050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6532388836732952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001072395074727968, + "kl": 0.0133819580078125, + "learning_rate": 6.481391422270311e-06, + "loss": 0.0005, + "num_tokens": 1914287618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6534095758299906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005063797149681296, + "kl": 0.0133209228515625, + "learning_rate": 6.475815155231677e-06, + "loss": 0.0005, + "num_tokens": 1914853794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.653580267986686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026231997578705316, + "kl": 0.01324462890625, + "learning_rate": 6.470240139151445e-06, + "loss": 0.0005, + "num_tokens": 1915421442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6537509601433814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006813538913379891, + "kl": 0.0137481689453125, + "learning_rate": 6.464666376008544e-06, + "loss": 0.0006, + "num_tokens": 1915983202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6539216523000768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010426850205703532, + "kl": 0.0126495361328125, + "learning_rate": 6.459093867781463e-06, + "loss": 0.0005, + "num_tokens": 1916552274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6540923444567722, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012755798697971317, + "kl": 0.0127105712890625, + "learning_rate": 6.453522616448243e-06, + "loss": 0.0005, + "num_tokens": 1917116274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6542630366134676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002822604727706737, + "kl": 0.01361083984375, + "learning_rate": 6.447952623986469e-06, + "loss": 0.0005, + "num_tokens": 1917680226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.654433728770163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011475931157054785, + "kl": 0.0126800537109375, + "learning_rate": 6.442383892373294e-06, + "loss": 0.0005, + "num_tokens": 1918246946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6546044209268584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002184348228922625, + "kl": 0.01300048828125, + "learning_rate": 6.436816423585415e-06, + "loss": 0.0005, + "num_tokens": 1918811026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6547751130835539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007421790920601051, + "kl": 0.0123138427734375, + "learning_rate": 6.431250219599083e-06, + "loss": 0.0005, + "num_tokens": 1919379026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6549458052402493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023711381665191903, + "kl": 0.01324462890625, + "learning_rate": 6.425685282390096e-06, + "loss": 0.0005, + "num_tokens": 1919947970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6551164973969447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006295389718083945, + "kl": 0.013580322265625, + "learning_rate": 6.42012161393381e-06, + "loss": 0.0005, + "num_tokens": 1920512786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.65528718955364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011071670206677163, + "kl": 0.013153076171875, + "learning_rate": 6.414559216205125e-06, + "loss": 0.0005, + "num_tokens": 1921078994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6554578817103354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015594382794435798, + "kl": 0.0133819580078125, + "learning_rate": 6.408998091178492e-06, + "loss": 0.0005, + "num_tokens": 1921642466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6556285738670308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004826696035476578, + "kl": 0.012451171875, + "learning_rate": 6.403438240827906e-06, + "loss": 0.0005, + "num_tokens": 1922207490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6557992660237262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017605009280735444, + "kl": 0.0133514404296875, + "learning_rate": 6.397879667126918e-06, + "loss": 0.0005, + "num_tokens": 1922773186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6559699581804216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001108821237559112, + "kl": 0.012542724609375, + "learning_rate": 6.392322372048616e-06, + "loss": 0.0005, + "num_tokens": 1923339586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.656140650337117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006282957893666258, + "kl": 0.0132598876953125, + "learning_rate": 6.3867663575656505e-06, + "loss": 0.0005, + "num_tokens": 1923908162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6563113424938124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005479344725278788, + "kl": 0.0125579833984375, + "learning_rate": 6.38121162565019e-06, + "loss": 0.0005, + "num_tokens": 1924482290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6564820346505078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016691317291767216, + "kl": 0.013031005859375, + "learning_rate": 6.375658178273973e-06, + "loss": 0.0005, + "num_tokens": 1925054226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6566527268072032, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000767105068866715, + "kl": 0.0136871337890625, + "learning_rate": 6.370106017408272e-06, + "loss": 0.0005, + "num_tokens": 1925616770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6568234189638986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013079093254248998, + "kl": 0.0132293701171875, + "learning_rate": 6.364555145023905e-06, + "loss": 0.0005, + "num_tokens": 1926179122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.656994111120594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029145378420589006, + "kl": 0.013458251953125, + "learning_rate": 6.359005563091229e-06, + "loss": 0.0005, + "num_tokens": 1926745314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6571648032772894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048387971379163045, + "kl": 0.012664794921875, + "learning_rate": 6.3534572735801445e-06, + "loss": 0.0005, + "num_tokens": 1927308706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6573354954339848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004451916100074997, + "kl": 0.0129547119140625, + "learning_rate": 6.347910278460094e-06, + "loss": 0.0005, + "num_tokens": 1927874914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6575061875906802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012157214747454236, + "kl": 0.01312255859375, + "learning_rate": 6.342364579700065e-06, + "loss": 0.0005, + "num_tokens": 1928438802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6576768797473757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007243603636000768, + "kl": 0.01226806640625, + "learning_rate": 6.336820179268577e-06, + "loss": 0.0005, + "num_tokens": 1929004530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6578475719040711, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002913828854699218, + "kl": 0.0128326416015625, + "learning_rate": 6.331277079133688e-06, + "loss": 0.0005, + "num_tokens": 1929567314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6580182640607664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007077481682454316, + "kl": 0.0131072998046875, + "learning_rate": 6.325735281263006e-06, + "loss": 0.0005, + "num_tokens": 1930138242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6581889562174618, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003962770075033685, + "kl": 0.01324462890625, + "learning_rate": 6.320194787623667e-06, + "loss": 0.0005, + "num_tokens": 1930710386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6583596483741572, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006043498560264067, + "kl": 0.01373291015625, + "learning_rate": 6.314655600182343e-06, + "loss": 0.0005, + "num_tokens": 1931275442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6585303405308526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005450648001301892, + "kl": 0.0126190185546875, + "learning_rate": 6.309117720905247e-06, + "loss": 0.0005, + "num_tokens": 1931848082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.658701032687548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007181926055182723, + "kl": 0.013763427734375, + "learning_rate": 6.303581151758127e-06, + "loss": 0.0006, + "num_tokens": 1932412482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6588717248442434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007359883333295608, + "kl": 0.0125579833984375, + "learning_rate": 6.298045894706268e-06, + "loss": 0.0005, + "num_tokens": 1932980962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6590424170009388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003797251576344745, + "kl": 0.0130462646484375, + "learning_rate": 6.292511951714482e-06, + "loss": 0.0005, + "num_tokens": 1933550850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6592131091576342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025480176072396324, + "kl": 0.013092041015625, + "learning_rate": 6.286979324747118e-06, + "loss": 0.0005, + "num_tokens": 1934114658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6593838013143296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003980678962891387, + "kl": 0.0132904052734375, + "learning_rate": 6.281448015768063e-06, + "loss": 0.0005, + "num_tokens": 1934679074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.659554493471025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021646679904271322, + "kl": 0.013397216796875, + "learning_rate": 6.275918026740732e-06, + "loss": 0.0005, + "num_tokens": 1935242002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6597251856277204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009238562289701273, + "kl": 0.0135498046875, + "learning_rate": 6.2703893596280675e-06, + "loss": 0.0005, + "num_tokens": 1935811730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6598958777844158, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004023620703333864, + "kl": 0.0130157470703125, + "learning_rate": 6.26486201639255e-06, + "loss": 0.0005, + "num_tokens": 1936371842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6600665699411112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006555720976423634, + "kl": 0.012969970703125, + "learning_rate": 6.259335998996185e-06, + "loss": 0.0005, + "num_tokens": 1936938098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6602372620978066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001275467502537235, + "kl": 0.012603759765625, + "learning_rate": 6.253811309400515e-06, + "loss": 0.0005, + "num_tokens": 1937501154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.660407954254502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009784312563912102, + "kl": 0.0133819580078125, + "learning_rate": 6.248287949566594e-06, + "loss": 0.0005, + "num_tokens": 1938068002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6605786464111975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048004435559686386, + "kl": 0.0136566162109375, + "learning_rate": 6.242765921455025e-06, + "loss": 0.0005, + "num_tokens": 1938632002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6607493385678928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008896201201007265, + "kl": 0.012420654296875, + "learning_rate": 6.2372452270259235e-06, + "loss": 0.0005, + "num_tokens": 1939197634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6609200307245882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007775603492660408, + "kl": 0.01300048828125, + "learning_rate": 6.231725868238945e-06, + "loss": 0.0005, + "num_tokens": 1939763154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6610907228812836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004963785460834732, + "kl": 0.0124359130859375, + "learning_rate": 6.2262078470532496e-06, + "loss": 0.0005, + "num_tokens": 1940326754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.661261415037979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003515536118202356, + "kl": 0.01287841796875, + "learning_rate": 6.220691165427544e-06, + "loss": 0.0005, + "num_tokens": 1940893618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6614321071946744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006587620963708807, + "kl": 0.0136871337890625, + "learning_rate": 6.215175825320048e-06, + "loss": 0.0005, + "num_tokens": 1941465570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6616027993513698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008073543215384557, + "kl": 0.0129852294921875, + "learning_rate": 6.209661828688513e-06, + "loss": 0.0005, + "num_tokens": 1942031458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6617734915080652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006568285476790903, + "kl": 0.0126800537109375, + "learning_rate": 6.2041491774902055e-06, + "loss": 0.0005, + "num_tokens": 1942595490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6619441836647606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003144506992137846, + "kl": 0.0133514404296875, + "learning_rate": 6.1986378736819165e-06, + "loss": 0.0005, + "num_tokens": 1943159666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.662114875821456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000315407283379282, + "kl": 0.0121917724609375, + "learning_rate": 6.1931279192199586e-06, + "loss": 0.0005, + "num_tokens": 1943720690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6622855679781514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020600490630222768, + "kl": 0.01312255859375, + "learning_rate": 6.187619316060174e-06, + "loss": 0.0005, + "num_tokens": 1944283650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6624562601348468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012288092263562239, + "kl": 0.0136260986328125, + "learning_rate": 6.182112066157911e-06, + "loss": 0.0005, + "num_tokens": 1944847522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6626269522915422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032938621470009126, + "kl": 0.0130767822265625, + "learning_rate": 6.176606171468044e-06, + "loss": 0.0005, + "num_tokens": 1945411250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6627976444482376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005983468766849049, + "kl": 0.013427734375, + "learning_rate": 6.1711016339449715e-06, + "loss": 0.0005, + "num_tokens": 1945973266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.662968336604933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006969893801844362, + "kl": 0.0130462646484375, + "learning_rate": 6.165598455542607e-06, + "loss": 0.0005, + "num_tokens": 1946534498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6631390287616284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001810398189031937, + "kl": 0.01275634765625, + "learning_rate": 6.1600966382143734e-06, + "loss": 0.0005, + "num_tokens": 1947103922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6633097209183239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006506652124101387, + "kl": 0.01312255859375, + "learning_rate": 6.15459618391322e-06, + "loss": 0.0005, + "num_tokens": 1947672578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6634804130750191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016694566101694975, + "kl": 0.012725830078125, + "learning_rate": 6.149097094591611e-06, + "loss": 0.0005, + "num_tokens": 1948233986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6636511052317146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021923269264813994, + "kl": 0.013458251953125, + "learning_rate": 6.143599372201525e-06, + "loss": 0.0005, + "num_tokens": 1948797218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.66382179738841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006289094863535533, + "kl": 0.0131072998046875, + "learning_rate": 6.138103018694455e-06, + "loss": 0.0005, + "num_tokens": 1949360418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6639924895451054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014048034608152723, + "kl": 0.0130462646484375, + "learning_rate": 6.1326080360214014e-06, + "loss": 0.0005, + "num_tokens": 1949927202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6641631817018008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021674065119115773, + "kl": 0.0132904052734375, + "learning_rate": 6.127114426132894e-06, + "loss": 0.0005, + "num_tokens": 1950495986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6643338738584962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000757513143916054, + "kl": 0.0135955810546875, + "learning_rate": 6.1216221909789645e-06, + "loss": 0.0005, + "num_tokens": 1951062562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6645045660151916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043612568967362294, + "kl": 0.0125579833984375, + "learning_rate": 6.1161313325091535e-06, + "loss": 0.0005, + "num_tokens": 1951624626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.664675258171887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007987281853436206, + "kl": 0.01300048828125, + "learning_rate": 6.110641852672516e-06, + "loss": 0.0005, + "num_tokens": 1952201922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6648459503285824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015531420058285464, + "kl": 0.013458251953125, + "learning_rate": 6.105153753417628e-06, + "loss": 0.0005, + "num_tokens": 1952765922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6650166424852778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030507427370283336, + "kl": 0.013153076171875, + "learning_rate": 6.099667036692563e-06, + "loss": 0.0005, + "num_tokens": 1953333634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6651873346419732, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.475786606890923e-05, + "kl": 0.01275634765625, + "learning_rate": 6.0941817044448995e-06, + "loss": 0.0005, + "num_tokens": 1953900354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6653580267986686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002571737315746805, + "kl": 0.0130157470703125, + "learning_rate": 6.088697758621743e-06, + "loss": 0.0005, + "num_tokens": 1954466002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.665528718955364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016115788718406521, + "kl": 0.012939453125, + "learning_rate": 6.083215201169692e-06, + "loss": 0.0005, + "num_tokens": 1955032946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6656994111120594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025671278961128947, + "kl": 0.012298583984375, + "learning_rate": 6.077734034034859e-06, + "loss": 0.0005, + "num_tokens": 1955602626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6658701032687548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010940807116001708, + "kl": 0.012847900390625, + "learning_rate": 6.0722542591628544e-06, + "loss": 0.0005, + "num_tokens": 1956179490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6660407954254502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010207419025442359, + "kl": 0.0131988525390625, + "learning_rate": 6.066775878498805e-06, + "loss": 0.0005, + "num_tokens": 1956740962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6662114875821455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046046652737147286, + "kl": 0.013641357421875, + "learning_rate": 6.061298893987337e-06, + "loss": 0.0005, + "num_tokens": 1957318642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.666382179738841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026798912408219125, + "kl": 0.0142669677734375, + "learning_rate": 6.055823307572588e-06, + "loss": 0.0006, + "num_tokens": 1957882418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6665528718955364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024476672559843055, + "kl": 0.01397705078125, + "learning_rate": 6.050349121198182e-06, + "loss": 0.0006, + "num_tokens": 1958448946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6667235640522318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004736025903213875, + "kl": 0.013580322265625, + "learning_rate": 6.044876336807267e-06, + "loss": 0.0005, + "num_tokens": 1959014722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6668942562089272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048532910117187296, + "kl": 0.012786865234375, + "learning_rate": 6.039404956342478e-06, + "loss": 0.0005, + "num_tokens": 1959580850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6670649483656226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009942687872243102, + "kl": 0.01275634765625, + "learning_rate": 6.033934981745964e-06, + "loss": 0.0005, + "num_tokens": 1960155218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.667235640522318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004221416099952854, + "kl": 0.012969970703125, + "learning_rate": 6.028466414959362e-06, + "loss": 0.0005, + "num_tokens": 1960720834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6674063326790134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001693406387726116, + "kl": 0.0129852294921875, + "learning_rate": 6.022999257923821e-06, + "loss": 0.0005, + "num_tokens": 1961287154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6675770248357088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001108893515978488, + "kl": 0.0130615234375, + "learning_rate": 6.017533512579977e-06, + "loss": 0.0005, + "num_tokens": 1961852290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6677477169924042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012859335526243354, + "kl": 0.0128936767578125, + "learning_rate": 6.0120691808679835e-06, + "loss": 0.0005, + "num_tokens": 1962418690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6679184091490996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005254549362213071, + "kl": 0.013397216796875, + "learning_rate": 6.006606264727472e-06, + "loss": 0.0005, + "num_tokens": 1962985810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.668089101305795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006915345428321757, + "kl": 0.01348876953125, + "learning_rate": 6.00114476609758e-06, + "loss": 0.0005, + "num_tokens": 1963550274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6682597934624904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006046493262336881, + "kl": 0.0128631591796875, + "learning_rate": 5.995684686916949e-06, + "loss": 0.0005, + "num_tokens": 1964113058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6684304856191858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030876581945302427, + "kl": 0.0134735107421875, + "learning_rate": 5.990226029123709e-06, + "loss": 0.0005, + "num_tokens": 1964677266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6686011777758812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032043427779657966, + "kl": 0.0130615234375, + "learning_rate": 5.9847687946554815e-06, + "loss": 0.0005, + "num_tokens": 1965238562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6687718699325766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001046233432013191, + "kl": 0.0136566162109375, + "learning_rate": 5.979312985449388e-06, + "loss": 0.0005, + "num_tokens": 1965804274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.668942562089272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015105259586606866, + "kl": 0.01312255859375, + "learning_rate": 5.973858603442049e-06, + "loss": 0.0005, + "num_tokens": 1966365458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6691132542459673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012335208784754533, + "kl": 0.01263427734375, + "learning_rate": 5.968405650569572e-06, + "loss": 0.0005, + "num_tokens": 1966946562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6692839464026628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001755051821793894, + "kl": 0.0142669677734375, + "learning_rate": 5.962954128767555e-06, + "loss": 0.0006, + "num_tokens": 1967515826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6694546385593582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003296544742721886, + "kl": 0.013458251953125, + "learning_rate": 5.957504039971091e-06, + "loss": 0.0005, + "num_tokens": 1968079970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6696253307160536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019407055082212117, + "kl": 0.0130462646484375, + "learning_rate": 5.952055386114769e-06, + "loss": 0.0005, + "num_tokens": 1968642978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.669796022872749, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008641028832741368, + "kl": 0.0136871337890625, + "learning_rate": 5.946608169132665e-06, + "loss": 0.0005, + "num_tokens": 1969208082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6699667150294444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007789593053233167, + "kl": 0.0133056640625, + "learning_rate": 5.9411623909583374e-06, + "loss": 0.0005, + "num_tokens": 1969773202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6701374071861398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017071724409357429, + "kl": 0.014251708984375, + "learning_rate": 5.935718053524848e-06, + "loss": 0.0006, + "num_tokens": 1970337026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6703080993428352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007609063343697493, + "kl": 0.0131378173828125, + "learning_rate": 5.9302751587647374e-06, + "loss": 0.0005, + "num_tokens": 1970901474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6704787914995306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006156939093799302, + "kl": 0.0128173828125, + "learning_rate": 5.9248337086100385e-06, + "loss": 0.0005, + "num_tokens": 1971469330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.670649483656226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014820821120622074, + "kl": 0.0134735107421875, + "learning_rate": 5.919393704992265e-06, + "loss": 0.0005, + "num_tokens": 1972036098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6708201758129214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011592986544619282, + "kl": 0.0135650634765625, + "learning_rate": 5.913955149842425e-06, + "loss": 0.0005, + "num_tokens": 1972608530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6709908679696168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00111846732059256, + "kl": 0.01373291015625, + "learning_rate": 5.908518045091007e-06, + "loss": 0.0005, + "num_tokens": 1973169234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6711615601263122, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021079579767683252, + "kl": 0.0133209228515625, + "learning_rate": 5.9030823926679955e-06, + "loss": 0.0005, + "num_tokens": 1973733762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6713322522830076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001058350010927238, + "kl": 0.01336669921875, + "learning_rate": 5.89764819450284e-06, + "loss": 0.0005, + "num_tokens": 1974300466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.671502944439703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018357751797183812, + "kl": 0.0131072998046875, + "learning_rate": 5.89221545252449e-06, + "loss": 0.0005, + "num_tokens": 1974867186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6716736365963984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018772712654563812, + "kl": 0.0137176513671875, + "learning_rate": 5.8867841686613695e-06, + "loss": 0.0005, + "num_tokens": 1975431250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6718443287530937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001586298530537407, + "kl": 0.0136566162109375, + "learning_rate": 5.8813543448413965e-06, + "loss": 0.0005, + "num_tokens": 1975996002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6720150209097892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017354823447676042, + "kl": 0.014068603515625, + "learning_rate": 5.8759259829919505e-06, + "loss": 0.0006, + "num_tokens": 1976564930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6721857130664846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017806611912621225, + "kl": 0.0133209228515625, + "learning_rate": 5.870499085039912e-06, + "loss": 0.0005, + "num_tokens": 1977128738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.67235640522318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015096292377576412, + "kl": 0.0134429931640625, + "learning_rate": 5.865073652911631e-06, + "loss": 0.0005, + "num_tokens": 1977699762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6725270973798754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010075652892489787, + "kl": 0.013427734375, + "learning_rate": 5.859649688532943e-06, + "loss": 0.0005, + "num_tokens": 1978268178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6726977895365708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006659887983188475, + "kl": 0.0135650634765625, + "learning_rate": 5.854227193829158e-06, + "loss": 0.0005, + "num_tokens": 1978828338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6728684816932662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012888772977765143, + "kl": 0.013153076171875, + "learning_rate": 5.8488061707250635e-06, + "loss": 0.0005, + "num_tokens": 1979392178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6730391738499616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004322757375181388, + "kl": 0.0135498046875, + "learning_rate": 5.843386621144927e-06, + "loss": 0.0005, + "num_tokens": 1979961778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.673209866006657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001384521448634494, + "kl": 0.01385498046875, + "learning_rate": 5.8379685470125016e-06, + "loss": 0.0006, + "num_tokens": 1980528594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6733805581633524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005606295796090569, + "kl": 0.013946533203125, + "learning_rate": 5.832551950250996e-06, + "loss": 0.0006, + "num_tokens": 1981095538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6735512503200478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008343970887030628, + "kl": 0.01312255859375, + "learning_rate": 5.827136832783109e-06, + "loss": 0.0005, + "num_tokens": 1981662082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6737219424767432, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006009881069034554, + "kl": 0.0128631591796875, + "learning_rate": 5.821723196531017e-06, + "loss": 0.0005, + "num_tokens": 1982231410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6738926346334386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007320632692250689, + "kl": 0.0145263671875, + "learning_rate": 5.816311043416371e-06, + "loss": 0.0006, + "num_tokens": 1982802610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.674063326790134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011779037020565567, + "kl": 0.0135345458984375, + "learning_rate": 5.8109003753602735e-06, + "loss": 0.0005, + "num_tokens": 1983370258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6742340189468294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002922671602611309, + "kl": 0.013702392578125, + "learning_rate": 5.805491194283325e-06, + "loss": 0.0005, + "num_tokens": 1983939682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6744047111035248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001467622992330508, + "kl": 0.0130157470703125, + "learning_rate": 5.800083502105589e-06, + "loss": 0.0005, + "num_tokens": 1984507314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6745754032602201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031193996618686, + "kl": 0.0140228271484375, + "learning_rate": 5.794677300746606e-06, + "loss": 0.0006, + "num_tokens": 1985082082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6747460954169155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029630517151927195, + "kl": 0.0142059326171875, + "learning_rate": 5.789272592125378e-06, + "loss": 0.0006, + "num_tokens": 1985647842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.674916787573611, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036154487564039395, + "kl": 0.01409912109375, + "learning_rate": 5.7838693781603775e-06, + "loss": 0.0006, + "num_tokens": 1986212754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6750874797303064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002681932613316222, + "kl": 0.013946533203125, + "learning_rate": 5.778467660769553e-06, + "loss": 0.0006, + "num_tokens": 1986778610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6752581718870018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017911566738194724, + "kl": 0.014129638671875, + "learning_rate": 5.773067441870326e-06, + "loss": 0.0006, + "num_tokens": 1987341474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6754288640436972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002543223754912251, + "kl": 0.013916015625, + "learning_rate": 5.7676687233795716e-06, + "loss": 0.0006, + "num_tokens": 1987904786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6755995562003926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036662259170076666, + "kl": 0.013397216796875, + "learning_rate": 5.7622715072136395e-06, + "loss": 0.0005, + "num_tokens": 1988465138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.675770248357088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008574651007579602, + "kl": 0.01348876953125, + "learning_rate": 5.756875795288348e-06, + "loss": 0.0005, + "num_tokens": 1989027410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6759409405137834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007115691979362669, + "kl": 0.013580322265625, + "learning_rate": 5.751481589518987e-06, + "loss": 0.0005, + "num_tokens": 1989594034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6761116326704788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015337239505350002, + "kl": 0.0142974853515625, + "learning_rate": 5.7460888918202984e-06, + "loss": 0.0006, + "num_tokens": 1990164962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6762823248271742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011473594570951, + "kl": 0.01361083984375, + "learning_rate": 5.740697704106491e-06, + "loss": 0.0005, + "num_tokens": 1990741970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6764530169838696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016217396205450982, + "kl": 0.0138092041015625, + "learning_rate": 5.735308028291246e-06, + "loss": 0.0006, + "num_tokens": 1991304786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.676623709140565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004165985664602177, + "kl": 0.013519287109375, + "learning_rate": 5.72991986628771e-06, + "loss": 0.0005, + "num_tokens": 1991868802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6767944012972604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012705581431907148, + "kl": 0.0131378173828125, + "learning_rate": 5.72453322000848e-06, + "loss": 0.0005, + "num_tokens": 1992438146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6769650934539558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008656418703048205, + "kl": 0.013946533203125, + "learning_rate": 5.719148091365618e-06, + "loss": 0.0006, + "num_tokens": 1993007698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6771357856106512, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002061368176604904, + "kl": 0.013946533203125, + "learning_rate": 5.7137644822706515e-06, + "loss": 0.0006, + "num_tokens": 1993570450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6773064777673465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006365740415736517, + "kl": 0.0137481689453125, + "learning_rate": 5.7083823946345765e-06, + "loss": 0.0005, + "num_tokens": 1994136626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6774771699240419, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000820851276986529, + "kl": 0.012969970703125, + "learning_rate": 5.703001830367831e-06, + "loss": 0.0005, + "num_tokens": 1994703522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6776478620807374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008528480996194086, + "kl": 0.013031005859375, + "learning_rate": 5.697622791380319e-06, + "loss": 0.0005, + "num_tokens": 1995266690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6778185542374328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005765877239621121, + "kl": 0.013275146484375, + "learning_rate": 5.69224527958141e-06, + "loss": 0.0005, + "num_tokens": 1995834690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6779892463941282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013227830547093506, + "kl": 0.0137176513671875, + "learning_rate": 5.6868692968799285e-06, + "loss": 0.0005, + "num_tokens": 1996405010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6781599385508236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012066465912190193, + "kl": 0.0134429931640625, + "learning_rate": 5.6814948451841504e-06, + "loss": 0.0005, + "num_tokens": 1996972546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.678330630707519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010196561886947227, + "kl": 0.0135498046875, + "learning_rate": 5.6761219264018145e-06, + "loss": 0.0005, + "num_tokens": 1997540482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6785013228642144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001244175971366402, + "kl": 0.0130767822265625, + "learning_rate": 5.670750542440108e-06, + "loss": 0.0005, + "num_tokens": 1998107746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6786720150209098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002536552901246478, + "kl": 0.0135040283203125, + "learning_rate": 5.665380695205687e-06, + "loss": 0.0005, + "num_tokens": 1998671426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6788427071776052, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009970093690953653, + "kl": 0.013153076171875, + "learning_rate": 5.660012386604643e-06, + "loss": 0.0005, + "num_tokens": 1999236226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6790133993343006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003807436214613376, + "kl": 0.013671875, + "learning_rate": 5.6546456185425425e-06, + "loss": 0.0005, + "num_tokens": 1999803538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.679184091490996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013658921673352158, + "kl": 0.0154266357421875, + "learning_rate": 5.649280392924384e-06, + "loss": 0.0006, + "num_tokens": 2000372578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6793547836476914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0384103104477591, + "kl": 0.01788330078125, + "learning_rate": 5.643916711654639e-06, + "loss": 0.0007, + "num_tokens": 2000941074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6795254758043868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018730096126433335, + "kl": 0.013427734375, + "learning_rate": 5.638554576637212e-06, + "loss": 0.0005, + "num_tokens": 2001506482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6796961679610822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001554192452488385, + "kl": 0.013671875, + "learning_rate": 5.633193989775474e-06, + "loss": 0.0005, + "num_tokens": 2002076034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6798668601177776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011247880727243177, + "kl": 0.013885498046875, + "learning_rate": 5.627834952972233e-06, + "loss": 0.0006, + "num_tokens": 2002639698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6800375522744729, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001166269707338999, + "kl": 0.0141143798828125, + "learning_rate": 5.622477468129764e-06, + "loss": 0.0006, + "num_tokens": 2003200866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6802082444311683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001058110824040161, + "kl": 0.014434814453125, + "learning_rate": 5.617121537149768e-06, + "loss": 0.0006, + "num_tokens": 2003772866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6803789365878637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038384188487291366, + "kl": 0.0148468017578125, + "learning_rate": 5.611767161933417e-06, + "loss": 0.0006, + "num_tokens": 2004334898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6805496287445592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009828092494126525, + "kl": 0.0144805908203125, + "learning_rate": 5.6064143443813145e-06, + "loss": 0.0006, + "num_tokens": 2004903410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6807203209012546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011139257270491591, + "kl": 0.013641357421875, + "learning_rate": 5.6010630863935245e-06, + "loss": 0.0005, + "num_tokens": 2005471426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.68089101305795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011205614150965797, + "kl": 0.0155181884765625, + "learning_rate": 5.595713389869542e-06, + "loss": 0.0006, + "num_tokens": 2006036514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6810617052146454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012308020610472294, + "kl": 0.014404296875, + "learning_rate": 5.590365256708321e-06, + "loss": 0.0006, + "num_tokens": 2006599794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6812323973713408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011775057517265115, + "kl": 0.015167236328125, + "learning_rate": 5.585018688808259e-06, + "loss": 0.0006, + "num_tokens": 2007168594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6814030895280362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017019701706645564, + "kl": 0.014923095703125, + "learning_rate": 5.579673688067191e-06, + "loss": 0.0006, + "num_tokens": 2007734946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6815737816847316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023117741201002426, + "kl": 0.0158538818359375, + "learning_rate": 5.5743302563823945e-06, + "loss": 0.0006, + "num_tokens": 2008299410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.681744473841427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020009419243980616, + "kl": 0.0142822265625, + "learning_rate": 5.5689883956506e-06, + "loss": 0.0006, + "num_tokens": 2008864818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6819151659981224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004201580010756052, + "kl": 0.0146026611328125, + "learning_rate": 5.563648107767978e-06, + "loss": 0.0006, + "num_tokens": 2009428114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6820858581548178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007998926854956715, + "kl": 0.0143890380859375, + "learning_rate": 5.558309394630135e-06, + "loss": 0.0006, + "num_tokens": 2009987826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6822565503115132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001908960082307968, + "kl": 0.014923095703125, + "learning_rate": 5.552972258132117e-06, + "loss": 0.0006, + "num_tokens": 2010548354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6824272424682086, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006965591584812454, + "kl": 0.0144500732421875, + "learning_rate": 5.5476367001684196e-06, + "loss": 0.0006, + "num_tokens": 2011122274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.682597934624904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013159554221011188, + "kl": 0.0143890380859375, + "learning_rate": 5.542302722632975e-06, + "loss": 0.0006, + "num_tokens": 2011702914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 3999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6827686267815993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003631957957223891, + "kl": 0.0147705078125, + "learning_rate": 5.536970327419151e-06, + "loss": 0.0006, + "num_tokens": 2012264626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6829393189382947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002488864472340753, + "kl": 0.014678955078125, + "learning_rate": 5.531639516419751e-06, + "loss": 0.0006, + "num_tokens": 2012834930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6831100110949901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015419255192490207, + "kl": 0.0154266357421875, + "learning_rate": 5.526310291527024e-06, + "loss": 0.0006, + "num_tokens": 2013403010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6832807032516856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001883896886204168, + "kl": 0.01446533203125, + "learning_rate": 5.5209826546326575e-06, + "loss": 0.0006, + "num_tokens": 2013966850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.683451395408381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029016389938614274, + "kl": 0.0147247314453125, + "learning_rate": 5.515656607627764e-06, + "loss": 0.0006, + "num_tokens": 2014537538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6836220875650764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013074128534472924, + "kl": 0.014190673828125, + "learning_rate": 5.510332152402896e-06, + "loss": 0.0006, + "num_tokens": 2015105202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6837927797217718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020107026352227217, + "kl": 0.014923095703125, + "learning_rate": 5.505009290848047e-06, + "loss": 0.0006, + "num_tokens": 2015671266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6839634718784672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022778178852930564, + "kl": 0.0134735107421875, + "learning_rate": 5.499688024852642e-06, + "loss": 0.0005, + "num_tokens": 2016238258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6841341640351626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003419386213772866, + "kl": 0.015228271484375, + "learning_rate": 5.494368356305537e-06, + "loss": 0.0006, + "num_tokens": 2016805442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.684304856191858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002371686368106693, + "kl": 0.01416015625, + "learning_rate": 5.489050287095017e-06, + "loss": 0.0006, + "num_tokens": 2017374706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6844755483485534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014666409787068098, + "kl": 0.0141448974609375, + "learning_rate": 5.4837338191088095e-06, + "loss": 0.0006, + "num_tokens": 2017954690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6846462405052488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024942140431255164, + "kl": 0.0151824951171875, + "learning_rate": 5.478418954234069e-06, + "loss": 0.0006, + "num_tokens": 2018521026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6848169326619442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013833721385473339, + "kl": 0.014190673828125, + "learning_rate": 5.473105694357388e-06, + "loss": 0.0006, + "num_tokens": 2019085970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6849876248186396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019902487453243114, + "kl": 0.0137481689453125, + "learning_rate": 5.467794041364766e-06, + "loss": 0.0006, + "num_tokens": 2019659362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.685158316975335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047081883459276297, + "kl": 0.0139923095703125, + "learning_rate": 5.462483997141655e-06, + "loss": 0.0006, + "num_tokens": 2020223570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6853290091320304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004780870422315888, + "kl": 0.0143890380859375, + "learning_rate": 5.457175563572932e-06, + "loss": 0.0006, + "num_tokens": 2020790610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6854997012887258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005543606786310977, + "kl": 0.0138092041015625, + "learning_rate": 5.4518687425429e-06, + "loss": 0.0006, + "num_tokens": 2021353794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6856703934454211, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001221507686325078, + "kl": 0.014495849609375, + "learning_rate": 5.446563535935286e-06, + "loss": 0.0006, + "num_tokens": 2021926018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6858410856021165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028201456444512277, + "kl": 0.01422119140625, + "learning_rate": 5.441259945633244e-06, + "loss": 0.0006, + "num_tokens": 2022504034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.686011777758812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026173498020411563, + "kl": 0.0142822265625, + "learning_rate": 5.43595797351936e-06, + "loss": 0.0006, + "num_tokens": 2023072354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6861824699155074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001507396114158326, + "kl": 0.0143890380859375, + "learning_rate": 5.430657621475647e-06, + "loss": 0.0006, + "num_tokens": 2023638994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6863531620722028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008625417564884596, + "kl": 0.0141143798828125, + "learning_rate": 5.425358891383535e-06, + "loss": 0.0006, + "num_tokens": 2024204242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6865238542288982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010518413624394233, + "kl": 0.0134124755859375, + "learning_rate": 5.420061785123876e-06, + "loss": 0.0005, + "num_tokens": 2024773474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6866945463855936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001837096200394185, + "kl": 0.0133819580078125, + "learning_rate": 5.414766304576958e-06, + "loss": 0.0005, + "num_tokens": 2025336306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.686865238542289, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005399270883347621, + "kl": 0.0136260986328125, + "learning_rate": 5.4094724516224885e-06, + "loss": 0.0005, + "num_tokens": 2025900962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6870359306989844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030879337730186637, + "kl": 0.013336181640625, + "learning_rate": 5.404180228139591e-06, + "loss": 0.0005, + "num_tokens": 2026472674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6872066228556798, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012283832408713636, + "kl": 0.013824462890625, + "learning_rate": 5.398889636006808e-06, + "loss": 0.0006, + "num_tokens": 2027041986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6873773150123752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004801940916413811, + "kl": 0.01361083984375, + "learning_rate": 5.393600677102113e-06, + "loss": 0.0005, + "num_tokens": 2027615842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6875480071690706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010769311876565706, + "kl": 0.013214111328125, + "learning_rate": 5.388313353302901e-06, + "loss": 0.0005, + "num_tokens": 2028175826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.687718699325766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005696988759254778, + "kl": 0.013580322265625, + "learning_rate": 5.383027666485976e-06, + "loss": 0.0005, + "num_tokens": 2028737570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6878893914824614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021010605996439174, + "kl": 0.0130462646484375, + "learning_rate": 5.3777436185275624e-06, + "loss": 0.0005, + "num_tokens": 2029299826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6880600836391568, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014284322227801813, + "kl": 0.012542724609375, + "learning_rate": 5.3724612113033105e-06, + "loss": 0.0005, + "num_tokens": 2029868226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6882307757958522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005529224777303945, + "kl": 0.0135650634765625, + "learning_rate": 5.367180446688287e-06, + "loss": 0.0005, + "num_tokens": 2030435762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6884014679525475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006578223625274277, + "kl": 0.013275146484375, + "learning_rate": 5.361901326556967e-06, + "loss": 0.0005, + "num_tokens": 2031001970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6885721601092429, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007731395821735989, + "kl": 0.0135040283203125, + "learning_rate": 5.356623852783253e-06, + "loss": 0.0005, + "num_tokens": 2031568722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6887428522659383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007051162013352676, + "kl": 0.0128021240234375, + "learning_rate": 5.351348027240453e-06, + "loss": 0.0005, + "num_tokens": 2032134130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6889135444226338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007591507334906212, + "kl": 0.013580322265625, + "learning_rate": 5.346073851801299e-06, + "loss": 0.0005, + "num_tokens": 2032697362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6890842365793292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004470866690432807, + "kl": 0.0127105712890625, + "learning_rate": 5.340801328337927e-06, + "loss": 0.0005, + "num_tokens": 2033270082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6892549287360246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002755388172554218, + "kl": 0.013824462890625, + "learning_rate": 5.3355304587219e-06, + "loss": 0.0006, + "num_tokens": 2033838290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.68942562089272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008043766413300253, + "kl": 0.0130767822265625, + "learning_rate": 5.330261244824178e-06, + "loss": 0.0005, + "num_tokens": 2034402418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6895963130494154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012836845996464811, + "kl": 0.0127410888671875, + "learning_rate": 5.324993688515152e-06, + "loss": 0.0005, + "num_tokens": 2034961138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6897670052061108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035557196111827785, + "kl": 0.013092041015625, + "learning_rate": 5.319727791664603e-06, + "loss": 0.0005, + "num_tokens": 2035527410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6899376973628062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014711600988193514, + "kl": 0.013336181640625, + "learning_rate": 5.314463556141743e-06, + "loss": 0.0005, + "num_tokens": 2036097138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6901083895195016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009162830496017082, + "kl": 0.0127410888671875, + "learning_rate": 5.30920098381518e-06, + "loss": 0.0005, + "num_tokens": 2036661618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.690279081676197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006771204840480093, + "kl": 0.0133209228515625, + "learning_rate": 5.303940076552943e-06, + "loss": 0.0005, + "num_tokens": 2037222674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6904497738328924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005282215923293642, + "kl": 0.0139312744140625, + "learning_rate": 5.298680836222457e-06, + "loss": 0.0006, + "num_tokens": 2037783426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6906204659895878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007170311866378839, + "kl": 0.0134429931640625, + "learning_rate": 5.29342326469057e-06, + "loss": 0.0005, + "num_tokens": 2038343698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6907911581462832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002599103485363529, + "kl": 0.0135040283203125, + "learning_rate": 5.2881673638235235e-06, + "loss": 0.0005, + "num_tokens": 2038910242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6909618503029786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005437320214390108, + "kl": 0.0130615234375, + "learning_rate": 5.282913135486978e-06, + "loss": 0.0005, + "num_tokens": 2039476722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6911325424596739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005622381895974603, + "kl": 0.0129547119140625, + "learning_rate": 5.277660581545989e-06, + "loss": 0.0005, + "num_tokens": 2040041986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6913032346163693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010515464264497349, + "kl": 0.013885498046875, + "learning_rate": 5.272409703865027e-06, + "loss": 0.0006, + "num_tokens": 2040613618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6914739267730647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002516936164378546, + "kl": 0.0131378173828125, + "learning_rate": 5.267160504307968e-06, + "loss": 0.0005, + "num_tokens": 2041176434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6916446189297601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001933952506546423, + "kl": 0.01263427734375, + "learning_rate": 5.261912984738083e-06, + "loss": 0.0005, + "num_tokens": 2041743106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6918153110864556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001126822280805462, + "kl": 0.0132598876953125, + "learning_rate": 5.256667147018051e-06, + "loss": 0.0005, + "num_tokens": 2042307202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.691986003243151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010152976715198357, + "kl": 0.0126495361328125, + "learning_rate": 5.251422993009956e-06, + "loss": 0.0005, + "num_tokens": 2042868786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6921566953998464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009220288440440316, + "kl": 0.01348876953125, + "learning_rate": 5.24618052457529e-06, + "loss": 0.0005, + "num_tokens": 2043439634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6923273875565418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009567279833887128, + "kl": 0.0128021240234375, + "learning_rate": 5.240939743574933e-06, + "loss": 0.0005, + "num_tokens": 2044009858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6924980797132372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007565033262986509, + "kl": 0.012939453125, + "learning_rate": 5.235700651869173e-06, + "loss": 0.0005, + "num_tokens": 2044583186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6926687718699326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004339050959612615, + "kl": 0.0130157470703125, + "learning_rate": 5.230463251317699e-06, + "loss": 0.0005, + "num_tokens": 2045155490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.692839464026628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005059108108826868, + "kl": 0.0130462646484375, + "learning_rate": 5.225227543779604e-06, + "loss": 0.0005, + "num_tokens": 2045720994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6930101561833234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002500626234886813, + "kl": 0.0128631591796875, + "learning_rate": 5.219993531113372e-06, + "loss": 0.0005, + "num_tokens": 2046284802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6931808483400188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032232616486499452, + "kl": 0.01385498046875, + "learning_rate": 5.214761215176884e-06, + "loss": 0.0006, + "num_tokens": 2046853922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6933515404967142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006245690639935527, + "kl": 0.0129241943359375, + "learning_rate": 5.209530597827428e-06, + "loss": 0.0005, + "num_tokens": 2047422258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6935222326534096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002919476342963112, + "kl": 0.0126800537109375, + "learning_rate": 5.204301680921688e-06, + "loss": 0.0005, + "num_tokens": 2047983426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.693692924810105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009208406695942679, + "kl": 0.0126800537109375, + "learning_rate": 5.199074466315735e-06, + "loss": 0.0005, + "num_tokens": 2048542082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6938636169668003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032088291104797917, + "kl": 0.01312255859375, + "learning_rate": 5.193848955865041e-06, + "loss": 0.0005, + "num_tokens": 2049106690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6940343091234957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009885895937299556, + "kl": 0.0126190185546875, + "learning_rate": 5.188625151424474e-06, + "loss": 0.0005, + "num_tokens": 2049692706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6942050012801911, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001097791492371463, + "kl": 0.0128936767578125, + "learning_rate": 5.1834030548483014e-06, + "loss": 0.0005, + "num_tokens": 2050258130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6943756934368865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006977039786804909, + "kl": 0.01348876953125, + "learning_rate": 5.178182667990177e-06, + "loss": 0.0005, + "num_tokens": 2050819042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.694546385593582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006109265883103731, + "kl": 0.0129852294921875, + "learning_rate": 5.172963992703142e-06, + "loss": 0.0005, + "num_tokens": 2051385362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6947170777502774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033981531190832603, + "kl": 0.0133819580078125, + "learning_rate": 5.1677470308396445e-06, + "loss": 0.0005, + "num_tokens": 2051951794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6948877699069728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021103584584865976, + "kl": 0.0131072998046875, + "learning_rate": 5.1625317842515165e-06, + "loss": 0.0005, + "num_tokens": 2052515154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6950584620636682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002996564110022568, + "kl": 0.01300048828125, + "learning_rate": 5.1573182547899905e-06, + "loss": 0.0005, + "num_tokens": 2053080706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6952291542203636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030852816592754137, + "kl": 0.012847900390625, + "learning_rate": 5.152106444305665e-06, + "loss": 0.0005, + "num_tokens": 2053651042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.695399846377059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007744232421197801, + "kl": 0.0129547119140625, + "learning_rate": 5.1468963546485536e-06, + "loss": 0.0005, + "num_tokens": 2054218882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6955705385337544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000158898965544772, + "kl": 0.01287841796875, + "learning_rate": 5.141687987668049e-06, + "loss": 0.0005, + "num_tokens": 2054783858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6957412306904498, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017176516466799676, + "kl": 0.0131988525390625, + "learning_rate": 5.1364813452129384e-06, + "loss": 0.0005, + "num_tokens": 2055349394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6959119228471452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009333904880156296, + "kl": 0.013275146484375, + "learning_rate": 5.131276429131385e-06, + "loss": 0.0005, + "num_tokens": 2055914738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6960826150038406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011871890777681764, + "kl": 0.0124969482421875, + "learning_rate": 5.126073241270946e-06, + "loss": 0.0005, + "num_tokens": 2056485298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.696253307160536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007594439189445981, + "kl": 0.01324462890625, + "learning_rate": 5.120871783478568e-06, + "loss": 0.0005, + "num_tokens": 2057050274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6964239993172314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009161208414177204, + "kl": 0.012359619140625, + "learning_rate": 5.115672057600585e-06, + "loss": 0.0005, + "num_tokens": 2057616978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6965946914739267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002674072718649113, + "kl": 0.012908935546875, + "learning_rate": 5.110474065482705e-06, + "loss": 0.0005, + "num_tokens": 2058183218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6967653836306221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019053608566542844, + "kl": 0.01263427734375, + "learning_rate": 5.105277808970028e-06, + "loss": 0.0005, + "num_tokens": 2058752674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6969360757873175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005987708475630386, + "kl": 0.01263427734375, + "learning_rate": 5.10008328990704e-06, + "loss": 0.0005, + "num_tokens": 2059323826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6971067679440129, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023003403588312034, + "kl": 0.0117645263671875, + "learning_rate": 5.0948905101376105e-06, + "loss": 0.0005, + "num_tokens": 2059888146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6972774601007083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003322909740613575, + "kl": 0.0127410888671875, + "learning_rate": 5.089699471504985e-06, + "loss": 0.0005, + "num_tokens": 2060453922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6974481522574038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006252561842265382, + "kl": 0.0131988525390625, + "learning_rate": 5.084510175851791e-06, + "loss": 0.0005, + "num_tokens": 2061016018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6976188444140992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003478712543589122, + "kl": 0.0128021240234375, + "learning_rate": 5.079322625020047e-06, + "loss": 0.0005, + "num_tokens": 2061583778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6977895365707946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006237013157847694, + "kl": 0.0129547119140625, + "learning_rate": 5.074136820851149e-06, + "loss": 0.0005, + "num_tokens": 2062144642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.69796022872749, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010478994927594263, + "kl": 0.0127410888671875, + "learning_rate": 5.068952765185865e-06, + "loss": 0.0005, + "num_tokens": 2062711826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6981309208841854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042538671137968187, + "kl": 0.012939453125, + "learning_rate": 5.063770459864346e-06, + "loss": 0.0005, + "num_tokens": 2063280386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6983016130408808, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002729520626238864, + "kl": 0.0131072998046875, + "learning_rate": 5.058589906726125e-06, + "loss": 0.0005, + "num_tokens": 2063845186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6984723051975762, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027109451693538506, + "kl": 0.013427734375, + "learning_rate": 5.0534111076101165e-06, + "loss": 0.0005, + "num_tokens": 2064414978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6986429973542716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007783497390880804, + "kl": 0.013031005859375, + "learning_rate": 5.0482340643546e-06, + "loss": 0.0005, + "num_tokens": 2064979170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.698813689510967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008701426106507467, + "kl": 0.0131072998046875, + "learning_rate": 5.0430587787972454e-06, + "loss": 0.0005, + "num_tokens": 2065542370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6989843816676624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032621614481580333, + "kl": 0.0127410888671875, + "learning_rate": 5.037885252775085e-06, + "loss": 0.0005, + "num_tokens": 2066105138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6991550738243578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002590895913976236, + "kl": 0.01318359375, + "learning_rate": 5.032713488124542e-06, + "loss": 0.0005, + "num_tokens": 2066670818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6993257659810531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003801483768817894, + "kl": 0.0126190185546875, + "learning_rate": 5.027543486681399e-06, + "loss": 0.0005, + "num_tokens": 2067233746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6994964581377485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002915664591837247, + "kl": 0.01287841796875, + "learning_rate": 5.0223752502808255e-06, + "loss": 0.0005, + "num_tokens": 2067799138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6996671502944439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002303944633269534, + "kl": 0.012847900390625, + "learning_rate": 5.017208780757353e-06, + "loss": 0.0005, + "num_tokens": 2068365570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6998378424511393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002111876076640373, + "kl": 0.013275146484375, + "learning_rate": 5.012044079944897e-06, + "loss": 0.0005, + "num_tokens": 2068929346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7000085346078347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009698830824295333, + "kl": 0.0131683349609375, + "learning_rate": 5.0068811496767365e-06, + "loss": 0.0005, + "num_tokens": 2069494146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7001792267645301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030899187371837315, + "kl": 0.0125732421875, + "learning_rate": 5.00171999178553e-06, + "loss": 0.0005, + "num_tokens": 2070057170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7003499189212256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003807762877489122, + "kl": 0.012603759765625, + "learning_rate": 4.9965606081032965e-06, + "loss": 0.0005, + "num_tokens": 2070625298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.700520611077921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000417587900027891, + "kl": 0.0125579833984375, + "learning_rate": 4.991403000461432e-06, + "loss": 0.0005, + "num_tokens": 2071187970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7006913032346164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011674059708920823, + "kl": 0.0130615234375, + "learning_rate": 4.986247170690702e-06, + "loss": 0.0005, + "num_tokens": 2071751650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7008619953913118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045207304563459164, + "kl": 0.0126800537109375, + "learning_rate": 4.981093120621243e-06, + "loss": 0.0005, + "num_tokens": 2072318498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7010326875480072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006162815461755473, + "kl": 0.013031005859375, + "learning_rate": 4.975940852082555e-06, + "loss": 0.0005, + "num_tokens": 2072887058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7012033797047026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018896335782344483, + "kl": 0.01348876953125, + "learning_rate": 4.970790366903503e-06, + "loss": 0.0005, + "num_tokens": 2073449586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.701374071861398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000495950466175603, + "kl": 0.012908935546875, + "learning_rate": 4.965641666912325e-06, + "loss": 0.0005, + "num_tokens": 2074016114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7015447640180934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012511952969241403, + "kl": 0.012725830078125, + "learning_rate": 4.960494753936631e-06, + "loss": 0.0005, + "num_tokens": 2074582498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7017154561747888, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022440654368900188, + "kl": 0.0129547119140625, + "learning_rate": 4.955349629803383e-06, + "loss": 0.0005, + "num_tokens": 2075145858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7018861483314842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012139178509557491, + "kl": 0.0126800537109375, + "learning_rate": 4.950206296338911e-06, + "loss": 0.0005, + "num_tokens": 2075708466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7020568404881796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012702052093251686, + "kl": 0.013458251953125, + "learning_rate": 4.945064755368917e-06, + "loss": 0.0005, + "num_tokens": 2076275058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7022275326448749, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006636918895165873, + "kl": 0.01336669921875, + "learning_rate": 4.939925008718461e-06, + "loss": 0.0005, + "num_tokens": 2076838418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7023982248015703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007834580144128655, + "kl": 0.0132293701171875, + "learning_rate": 4.934787058211978e-06, + "loss": 0.0005, + "num_tokens": 2077410546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7025689169582657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020725954873069676, + "kl": 0.01251220703125, + "learning_rate": 4.929650905673237e-06, + "loss": 0.0005, + "num_tokens": 2077975698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7027396091149611, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005782445131283276, + "kl": 0.014129638671875, + "learning_rate": 4.924516552925396e-06, + "loss": 0.0006, + "num_tokens": 2078536386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7029103012716565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008447498437618746, + "kl": 0.012969970703125, + "learning_rate": 4.919384001790963e-06, + "loss": 0.0005, + "num_tokens": 2079102978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.703080993428352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0047473828970523504, + "kl": 0.013916015625, + "learning_rate": 4.9142532540918156e-06, + "loss": 0.0006, + "num_tokens": 2079673090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7032516855850474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000356317145180048, + "kl": 0.012664794921875, + "learning_rate": 4.909124311649176e-06, + "loss": 0.0005, + "num_tokens": 2080236738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7034223777417428, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016771042536786634, + "kl": 0.012420654296875, + "learning_rate": 4.903997176283634e-06, + "loss": 0.0005, + "num_tokens": 2080814338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7035930698984382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000545294118462223, + "kl": 0.0134429931640625, + "learning_rate": 4.898871849815137e-06, + "loss": 0.0005, + "num_tokens": 2081381874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7037637620551336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010563611636156328, + "kl": 0.0134124755859375, + "learning_rate": 4.8937483340629985e-06, + "loss": 0.0005, + "num_tokens": 2081947426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.703934454211829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005048619494842838, + "kl": 0.0131988525390625, + "learning_rate": 4.888626630845875e-06, + "loss": 0.0005, + "num_tokens": 2082509698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7041051463685244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000359015743213777, + "kl": 0.0126800537109375, + "learning_rate": 4.8835067419817846e-06, + "loss": 0.0005, + "num_tokens": 2083075458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7042758385252198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011829727289364227, + "kl": 0.013031005859375, + "learning_rate": 4.8783886692881064e-06, + "loss": 0.0005, + "num_tokens": 2083643970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7044465306819152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023647294085285338, + "kl": 0.013458251953125, + "learning_rate": 4.873272414581575e-06, + "loss": 0.0005, + "num_tokens": 2084208850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7046172228386106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009624973168658681, + "kl": 0.0125885009765625, + "learning_rate": 4.868157979678272e-06, + "loss": 0.0005, + "num_tokens": 2084785714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.704787914995306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010317723975565092, + "kl": 0.01263427734375, + "learning_rate": 4.863045366393633e-06, + "loss": 0.0005, + "num_tokens": 2085349426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7049586071520013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026794296504126914, + "kl": 0.0130615234375, + "learning_rate": 4.857934576542456e-06, + "loss": 0.0005, + "num_tokens": 2085915634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7051292993086967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002335888998574936, + "kl": 0.0139617919921875, + "learning_rate": 4.85282561193889e-06, + "loss": 0.0006, + "num_tokens": 2086483058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7052999914653921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002607561065463338, + "kl": 0.01300048828125, + "learning_rate": 4.847718474396429e-06, + "loss": 0.0005, + "num_tokens": 2087048930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7054706836220875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002523156955710405, + "kl": 0.01324462890625, + "learning_rate": 4.84261316572792e-06, + "loss": 0.0005, + "num_tokens": 2087614194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7056413757787829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002839954588130906, + "kl": 0.0128936767578125, + "learning_rate": 4.837509687745565e-06, + "loss": 0.0005, + "num_tokens": 2088179970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7058120679354783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024188359211138578, + "kl": 0.0134735107421875, + "learning_rate": 4.83240804226092e-06, + "loss": 0.0005, + "num_tokens": 2088744994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7059827600921738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000750797552373047, + "kl": 0.01287841796875, + "learning_rate": 4.827308231084877e-06, + "loss": 0.0005, + "num_tokens": 2089319906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7061534522488692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006559129485579062, + "kl": 0.012969970703125, + "learning_rate": 4.822210256027692e-06, + "loss": 0.0005, + "num_tokens": 2089886018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7063241444055646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008529206678614756, + "kl": 0.0134124755859375, + "learning_rate": 4.817114118898956e-06, + "loss": 0.0005, + "num_tokens": 2090455666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.70649483656226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008414524435559663, + "kl": 0.0134124755859375, + "learning_rate": 4.812019821507619e-06, + "loss": 0.0005, + "num_tokens": 2091018114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7066655287189554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005902794044059927, + "kl": 0.0131988525390625, + "learning_rate": 4.806927365661966e-06, + "loss": 0.0005, + "num_tokens": 2091583186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7068362208756508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009708983059576366, + "kl": 0.0131072998046875, + "learning_rate": 4.801836753169643e-06, + "loss": 0.0005, + "num_tokens": 2092157186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7070069130323462, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008798559793855267, + "kl": 0.0134735107421875, + "learning_rate": 4.796747985837627e-06, + "loss": 0.0005, + "num_tokens": 2092718674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7071776051890416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010772601045819985, + "kl": 0.0136566162109375, + "learning_rate": 4.791661065472253e-06, + "loss": 0.0005, + "num_tokens": 2093282386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.707348297345737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036064408949800934, + "kl": 0.01312255859375, + "learning_rate": 4.786575993879186e-06, + "loss": 0.0005, + "num_tokens": 2093847986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7075189895024324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003803272838856279, + "kl": 0.0135345458984375, + "learning_rate": 4.7814927728634505e-06, + "loss": 0.0005, + "num_tokens": 2094409746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7076896816591277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007152923842608517, + "kl": 0.0132904052734375, + "learning_rate": 4.776411404229401e-06, + "loss": 0.0005, + "num_tokens": 2094972434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7078603738158231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006036563057858635, + "kl": 0.0127716064453125, + "learning_rate": 4.7713318897807445e-06, + "loss": 0.0005, + "num_tokens": 2095543650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7080310659725185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006469584608101718, + "kl": 0.013275146484375, + "learning_rate": 4.766254231320521e-06, + "loss": 0.0005, + "num_tokens": 2096105410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7082017581292139, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007967144529882207, + "kl": 0.0134429931640625, + "learning_rate": 4.761178430651121e-06, + "loss": 0.0005, + "num_tokens": 2096672802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7083724502859093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00033238640325245703, + "kl": 0.013519287109375, + "learning_rate": 4.7561044895742635e-06, + "loss": 0.0005, + "num_tokens": 2097237346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7085431424426047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010307713235180403, + "kl": 0.0128631591796875, + "learning_rate": 4.751032409891023e-06, + "loss": 0.0005, + "num_tokens": 2097803266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7087138345993002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005460648580081019, + "kl": 0.01275634765625, + "learning_rate": 4.7459621934017955e-06, + "loss": 0.0005, + "num_tokens": 2098373922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7088845267559956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006434560748797499, + "kl": 0.0124053955078125, + "learning_rate": 4.740893841906334e-06, + "loss": 0.0005, + "num_tokens": 2098940322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.709055218912691, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000539150515382697, + "kl": 0.0123748779296875, + "learning_rate": 4.735827357203712e-06, + "loss": 0.0005, + "num_tokens": 2099505138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7092259110693864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002606150076911488, + "kl": 0.0132904052734375, + "learning_rate": 4.730762741092355e-06, + "loss": 0.0005, + "num_tokens": 2100067746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7093966032260818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001609589136976477, + "kl": 0.0135650634765625, + "learning_rate": 4.725699995370013e-06, + "loss": 0.0005, + "num_tokens": 2100630370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7095672953827772, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008012241424633484, + "kl": 0.0135498046875, + "learning_rate": 4.720639121833782e-06, + "loss": 0.0005, + "num_tokens": 2101199410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7097379875394726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006624130636078793, + "kl": 0.0129852294921875, + "learning_rate": 4.715580122280093e-06, + "loss": 0.0005, + "num_tokens": 2101764242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.709908679696168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004898555535322143, + "kl": 0.0127716064453125, + "learning_rate": 4.710522998504703e-06, + "loss": 0.0005, + "num_tokens": 2102330450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7100793718528634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006648132605387921, + "kl": 0.0134735107421875, + "learning_rate": 4.705467752302706e-06, + "loss": 0.0005, + "num_tokens": 2102895666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7102500640095588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004910903537947583, + "kl": 0.0128021240234375, + "learning_rate": 4.700414385468536e-06, + "loss": 0.0005, + "num_tokens": 2103462050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7104207561662541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012996363467409201, + "kl": 0.0130767822265625, + "learning_rate": 4.695362899795958e-06, + "loss": 0.0005, + "num_tokens": 2104028338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7105914483229495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000375321628358221, + "kl": 0.0128173828125, + "learning_rate": 4.690313297078064e-06, + "loss": 0.0005, + "num_tokens": 2104593154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7107621404796449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021759507028902865, + "kl": 0.013275146484375, + "learning_rate": 4.685265579107278e-06, + "loss": 0.0005, + "num_tokens": 2105159154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7109328326363403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001217229200684299, + "kl": 0.013214111328125, + "learning_rate": 4.68021974767536e-06, + "loss": 0.0005, + "num_tokens": 2105722050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7111035247930357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008148893878918757, + "kl": 0.01336669921875, + "learning_rate": 4.675175804573402e-06, + "loss": 0.0005, + "num_tokens": 2106286002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7112742169497311, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007502594494697326, + "kl": 0.01300048828125, + "learning_rate": 4.670133751591817e-06, + "loss": 0.0005, + "num_tokens": 2106845234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7114449091064265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045957301285556737, + "kl": 0.01385498046875, + "learning_rate": 4.665093590520352e-06, + "loss": 0.0006, + "num_tokens": 2107409698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.711615601263122, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005536105278259377, + "kl": 0.0135345458984375, + "learning_rate": 4.660055323148082e-06, + "loss": 0.0005, + "num_tokens": 2107976338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7117862934198174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040268414818900544, + "kl": 0.01300048828125, + "learning_rate": 4.655018951263415e-06, + "loss": 0.0005, + "num_tokens": 2108540354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7119569855765128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007290666613153871, + "kl": 0.01214599609375, + "learning_rate": 4.649984476654078e-06, + "loss": 0.0005, + "num_tokens": 2109102674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7121276777332082, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001461184375504099, + "kl": 0.0137786865234375, + "learning_rate": 4.644951901107123e-06, + "loss": 0.0006, + "num_tokens": 2109669058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7122983698899036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028669380930035136, + "kl": 0.0132904052734375, + "learning_rate": 4.639921226408937e-06, + "loss": 0.0005, + "num_tokens": 2110236802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.712469062046599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014537455389572733, + "kl": 0.0130157470703125, + "learning_rate": 4.634892454345229e-06, + "loss": 0.0005, + "num_tokens": 2110805026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7126397542032944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024816126981579443, + "kl": 0.01385498046875, + "learning_rate": 4.6298655867010365e-06, + "loss": 0.0006, + "num_tokens": 2111375890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7128104463599898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021871039863754545, + "kl": 0.0130767822265625, + "learning_rate": 4.624840625260704e-06, + "loss": 0.0005, + "num_tokens": 2111942546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7129811385166852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003799501810625949, + "kl": 0.012908935546875, + "learning_rate": 4.619817571807915e-06, + "loss": 0.0005, + "num_tokens": 2112508930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7131518306733805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013935960548785143, + "kl": 0.01312255859375, + "learning_rate": 4.614796428125676e-06, + "loss": 0.0005, + "num_tokens": 2113074098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7133225228300759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030697299121906833, + "kl": 0.0124969482421875, + "learning_rate": 4.609777195996316e-06, + "loss": 0.0005, + "num_tokens": 2113637330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7134932149867713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007623850820946365, + "kl": 0.012786865234375, + "learning_rate": 4.604759877201472e-06, + "loss": 0.0005, + "num_tokens": 2114201154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7136639071434667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001070857768081951, + "kl": 0.0132598876953125, + "learning_rate": 4.599744473522113e-06, + "loss": 0.0005, + "num_tokens": 2114764834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7138345993001621, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018806325777063796, + "kl": 0.013702392578125, + "learning_rate": 4.594730986738528e-06, + "loss": 0.0005, + "num_tokens": 2115330370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7140052914568575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010971884618592007, + "kl": 0.013946533203125, + "learning_rate": 4.589719418630327e-06, + "loss": 0.0006, + "num_tokens": 2115891666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.714175983613553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000853760761371904, + "kl": 0.0132598876953125, + "learning_rate": 4.5847097709764325e-06, + "loss": 0.0005, + "num_tokens": 2116464642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7143466757702484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006817700745798888, + "kl": 0.0135040283203125, + "learning_rate": 4.5797020455550845e-06, + "loss": 0.0005, + "num_tokens": 2117027682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7145173679269438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002840812034276214, + "kl": 0.0127105712890625, + "learning_rate": 4.574696244143849e-06, + "loss": 0.0005, + "num_tokens": 2117591634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7146880600836392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000516259201120588, + "kl": 0.0126800537109375, + "learning_rate": 4.569692368519608e-06, + "loss": 0.0005, + "num_tokens": 2118153778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7148587522403346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047159630878371086, + "kl": 0.012481689453125, + "learning_rate": 4.564690420458554e-06, + "loss": 0.0005, + "num_tokens": 2118720098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.71502944439703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013108525227311873, + "kl": 0.0133209228515625, + "learning_rate": 4.5596904017361936e-06, + "loss": 0.0005, + "num_tokens": 2119281618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7152001365537254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000605285467664728, + "kl": 0.01324462890625, + "learning_rate": 4.554692314127356e-06, + "loss": 0.0005, + "num_tokens": 2119845874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7153708287104208, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010378499839097976, + "kl": 0.013214111328125, + "learning_rate": 4.549696159406187e-06, + "loss": 0.0005, + "num_tokens": 2120409298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7155415208671162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021299478739891393, + "kl": 0.0126953125, + "learning_rate": 4.544701939346137e-06, + "loss": 0.0005, + "num_tokens": 2120976162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7157122130238116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023346760290442266, + "kl": 0.0139923095703125, + "learning_rate": 4.539709655719969e-06, + "loss": 0.0006, + "num_tokens": 2121537762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7158829051805069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000756012270918173, + "kl": 0.0127410888671875, + "learning_rate": 4.5347193102997674e-06, + "loss": 0.0005, + "num_tokens": 2122101410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7160535973372023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004906671312556973, + "kl": 0.013427734375, + "learning_rate": 4.529730904856931e-06, + "loss": 0.0005, + "num_tokens": 2122666306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7162242894938977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005144500795564474, + "kl": 0.0133056640625, + "learning_rate": 4.524744441162152e-06, + "loss": 0.0005, + "num_tokens": 2123239538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7163949816505931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006740130322897038, + "kl": 0.01275634765625, + "learning_rate": 4.519759920985457e-06, + "loss": 0.0005, + "num_tokens": 2123817826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7165656738072885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027941006545162176, + "kl": 0.0131988525390625, + "learning_rate": 4.5147773460961596e-06, + "loss": 0.0005, + "num_tokens": 2124377922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7167363659639839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001258066357012332, + "kl": 0.01220703125, + "learning_rate": 4.5097967182629035e-06, + "loss": 0.0005, + "num_tokens": 2124940866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7169070581206793, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007418106301505626, + "kl": 0.0130767822265625, + "learning_rate": 4.504818039253623e-06, + "loss": 0.0005, + "num_tokens": 2125500834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7170777502773747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008986303851207914, + "kl": 0.0128936767578125, + "learning_rate": 4.499841310835578e-06, + "loss": 0.0005, + "num_tokens": 2126063026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7172484424340702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000206556167350325, + "kl": 0.0128021240234375, + "learning_rate": 4.4948665347753205e-06, + "loss": 0.0005, + "num_tokens": 2126638482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7174191345907656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002620812065086261, + "kl": 0.0132598876953125, + "learning_rate": 4.489893712838722e-06, + "loss": 0.0005, + "num_tokens": 2127208546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.717589826747461, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010634365174927052, + "kl": 0.0148162841796875, + "learning_rate": 4.484922846790949e-06, + "loss": 0.0006, + "num_tokens": 2127776242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7177605189041564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001786823438741254, + "kl": 0.0128173828125, + "learning_rate": 4.479953938396485e-06, + "loss": 0.0005, + "num_tokens": 2128345106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7179312110608518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029768010103466646, + "kl": 0.0130462646484375, + "learning_rate": 4.474986989419108e-06, + "loss": 0.0005, + "num_tokens": 2128908242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7181019032175472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004928759425010754, + "kl": 0.0132904052734375, + "learning_rate": 4.470022001621912e-06, + "loss": 0.0005, + "num_tokens": 2129476802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7182725953742426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018652034217183576, + "kl": 0.0125579833984375, + "learning_rate": 4.465058976767281e-06, + "loss": 0.0005, + "num_tokens": 2130039282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.718443287530938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005827375185188589, + "kl": 0.0131988525390625, + "learning_rate": 4.4600979166169165e-06, + "loss": 0.0005, + "num_tokens": 2130607714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7186139796876334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000962320874148537, + "kl": 0.01275634765625, + "learning_rate": 4.4551388229318094e-06, + "loss": 0.0005, + "num_tokens": 2131173698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7187846718443287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005836961618367637, + "kl": 0.01318359375, + "learning_rate": 4.450181697472266e-06, + "loss": 0.0005, + "num_tokens": 2131738882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7189553640010241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010360564917731854, + "kl": 0.0129241943359375, + "learning_rate": 4.445226541997878e-06, + "loss": 0.0005, + "num_tokens": 2132303570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7191260561577195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008221995833083669, + "kl": 0.013397216796875, + "learning_rate": 4.440273358267556e-06, + "loss": 0.0005, + "num_tokens": 2132869746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7192967483144149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001858999566488342, + "kl": 0.012908935546875, + "learning_rate": 4.435322148039495e-06, + "loss": 0.0005, + "num_tokens": 2133437970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7194674404711103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023583160234356385, + "kl": 0.013336181640625, + "learning_rate": 4.4303729130712e-06, + "loss": 0.0005, + "num_tokens": 2133999474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7196381326278057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008412077870449418, + "kl": 0.0136260986328125, + "learning_rate": 4.425425655119466e-06, + "loss": 0.0005, + "num_tokens": 2134567378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7198088247845011, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011605612528335892, + "kl": 0.013671875, + "learning_rate": 4.420480375940394e-06, + "loss": 0.0005, + "num_tokens": 2135132514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7199795169411966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004269069034567284, + "kl": 0.013641357421875, + "learning_rate": 4.415537077289382e-06, + "loss": 0.0005, + "num_tokens": 2135694370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.720150209097892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001922560648743397, + "kl": 0.0130767822265625, + "learning_rate": 4.410595760921122e-06, + "loss": 0.0005, + "num_tokens": 2136261778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7203209012545874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003779820461965252, + "kl": 0.0139312744140625, + "learning_rate": 4.405656428589597e-06, + "loss": 0.0006, + "num_tokens": 2136831506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7204915934112828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010902786387564335, + "kl": 0.013946533203125, + "learning_rate": 4.400719082048094e-06, + "loss": 0.0006, + "num_tokens": 2137399666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7206622855679782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007719795493412857, + "kl": 0.0131072998046875, + "learning_rate": 4.3957837230492e-06, + "loss": 0.0005, + "num_tokens": 2137976402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7208329777246736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048325352991171824, + "kl": 0.0130767822265625, + "learning_rate": 4.390850353344782e-06, + "loss": 0.0005, + "num_tokens": 2138541138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.721003669881369, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008434173339853482, + "kl": 0.0138092041015625, + "learning_rate": 4.385918974686006e-06, + "loss": 0.0006, + "num_tokens": 2139106546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7211743620380644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012473790433042624, + "kl": 0.01361083984375, + "learning_rate": 4.380989588823339e-06, + "loss": 0.0005, + "num_tokens": 2139669874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7213450541947598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005744301706024037, + "kl": 0.013275146484375, + "learning_rate": 4.376062197506536e-06, + "loss": 0.0005, + "num_tokens": 2140234978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7215157463514551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009193336712091026, + "kl": 0.0129852294921875, + "learning_rate": 4.371136802484642e-06, + "loss": 0.0005, + "num_tokens": 2140798514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7216864385081505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016722373556143364, + "kl": 0.013031005859375, + "learning_rate": 4.366213405505987e-06, + "loss": 0.0005, + "num_tokens": 2141359586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7218571306648459, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002647895424354759, + "kl": 0.0135345458984375, + "learning_rate": 4.361292008318206e-06, + "loss": 0.0005, + "num_tokens": 2141933474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7220278228215413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011753916387539413, + "kl": 0.012603759765625, + "learning_rate": 4.35637261266822e-06, + "loss": 0.0005, + "num_tokens": 2142493826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7221985149782367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001042716081352359, + "kl": 0.013458251953125, + "learning_rate": 4.351455220302232e-06, + "loss": 0.0005, + "num_tokens": 2143059202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7223692071349321, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003015099469184966, + "kl": 0.0137176513671875, + "learning_rate": 4.346539832965738e-06, + "loss": 0.0005, + "num_tokens": 2143620594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7225398992916275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043566823161226664, + "kl": 0.0138702392578125, + "learning_rate": 4.341626452403525e-06, + "loss": 0.0006, + "num_tokens": 2144190946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.722710591448323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001405414595974038, + "kl": 0.01312255859375, + "learning_rate": 4.3367150803596646e-06, + "loss": 0.0005, + "num_tokens": 2144757714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7228812836050184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015257381069568038, + "kl": 0.01318359375, + "learning_rate": 4.331805718577526e-06, + "loss": 0.0005, + "num_tokens": 2145328434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7230519757617138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022463918267402497, + "kl": 0.0135498046875, + "learning_rate": 4.32689836879974e-06, + "loss": 0.0005, + "num_tokens": 2145892674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7232226679184092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024411865629660607, + "kl": 0.0128173828125, + "learning_rate": 4.321993032768246e-06, + "loss": 0.0005, + "num_tokens": 2146461074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7233933600751046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009528420160988159, + "kl": 0.0126800537109375, + "learning_rate": 4.317089712224261e-06, + "loss": 0.0005, + "num_tokens": 2147037730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7235640522318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018740380600069248, + "kl": 0.0126495361328125, + "learning_rate": 4.31218840890829e-06, + "loss": 0.0005, + "num_tokens": 2147595890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7237347443884954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001305734719496953, + "kl": 0.0128173828125, + "learning_rate": 4.307289124560117e-06, + "loss": 0.0005, + "num_tokens": 2148169586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7239054365451908, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009124833098749526, + "kl": 0.0124969482421875, + "learning_rate": 4.302391860918804e-06, + "loss": 0.0005, + "num_tokens": 2148747810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7240761287018862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007737667961257969, + "kl": 0.0128631591796875, + "learning_rate": 4.297496619722711e-06, + "loss": 0.0005, + "num_tokens": 2149313602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7242468208585815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006863351822667584, + "kl": 0.01275634765625, + "learning_rate": 4.292603402709471e-06, + "loss": 0.0005, + "num_tokens": 2149879522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7244175130152769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002903160921878942, + "kl": 0.0121307373046875, + "learning_rate": 4.287712211615999e-06, + "loss": 0.0005, + "num_tokens": 2150446050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7245882051719723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017022602900133098, + "kl": 0.0123138427734375, + "learning_rate": 4.282823048178486e-06, + "loss": 0.0005, + "num_tokens": 2151013170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7247588973286677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000665608846024684, + "kl": 0.0126953125, + "learning_rate": 4.277935914132413e-06, + "loss": 0.0005, + "num_tokens": 2151581250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7249295894853631, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014436567802598923, + "kl": 0.0132904052734375, + "learning_rate": 4.273050811212539e-06, + "loss": 0.0005, + "num_tokens": 2152145778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7251002816420585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048389084061231354, + "kl": 0.0131683349609375, + "learning_rate": 4.268167741152895e-06, + "loss": 0.0005, + "num_tokens": 2152711826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7252709737987539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023270460921340328, + "kl": 0.01324462890625, + "learning_rate": 4.2632867056867945e-06, + "loss": 0.0005, + "num_tokens": 2153282162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7254416659554493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021186964932713077, + "kl": 0.0125885009765625, + "learning_rate": 4.258407706546829e-06, + "loss": 0.0005, + "num_tokens": 2153843810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7256123581121448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0040026478339985246, + "kl": 0.0135650634765625, + "learning_rate": 4.253530745464871e-06, + "loss": 0.0005, + "num_tokens": 2154411554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7257830502688402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036021852251153034, + "kl": 0.012908935546875, + "learning_rate": 4.248655824172063e-06, + "loss": 0.0005, + "num_tokens": 2154974674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7259537424255356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039007403050267614, + "kl": 0.0130157470703125, + "learning_rate": 4.243782944398822e-06, + "loss": 0.0005, + "num_tokens": 2155545090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.726124434582231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001730005183192199, + "kl": 0.0126495361328125, + "learning_rate": 4.238912107874849e-06, + "loss": 0.0005, + "num_tokens": 2156117346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7262951267389264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001537163321610134, + "kl": 0.01226806640625, + "learning_rate": 4.234043316329118e-06, + "loss": 0.0005, + "num_tokens": 2156683714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7264658188956218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006511635923117434, + "kl": 0.0128021240234375, + "learning_rate": 4.22917657148987e-06, + "loss": 0.0005, + "num_tokens": 2157254210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7266365110523172, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000855326440592297, + "kl": 0.0129547119140625, + "learning_rate": 4.2243118750846215e-06, + "loss": 0.0005, + "num_tokens": 2157816418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7268072032090126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010100070885046546, + "kl": 0.01336669921875, + "learning_rate": 4.219449228840165e-06, + "loss": 0.0005, + "num_tokens": 2158383170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7269778953657079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003948405152537171, + "kl": 0.012847900390625, + "learning_rate": 4.214588634482573e-06, + "loss": 0.0005, + "num_tokens": 2158947266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7271485875224033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039737261885924765, + "kl": 0.01318359375, + "learning_rate": 4.209730093737169e-06, + "loss": 0.0005, + "num_tokens": 2159516402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7273192796790987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011869899929353187, + "kl": 0.013275146484375, + "learning_rate": 4.204873608328568e-06, + "loss": 0.0005, + "num_tokens": 2160078802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7274899718357941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001089584198686922, + "kl": 0.0126495361328125, + "learning_rate": 4.200019179980641e-06, + "loss": 0.0005, + "num_tokens": 2160643490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7276606639924895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003001125941229442, + "kl": 0.0130767822265625, + "learning_rate": 4.195166810416542e-06, + "loss": 0.0005, + "num_tokens": 2161209874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7278313561491849, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003061412911844066, + "kl": 0.0125732421875, + "learning_rate": 4.190316501358679e-06, + "loss": 0.0005, + "num_tokens": 2161777154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7280020483058803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004887835712488063, + "kl": 0.013671875, + "learning_rate": 4.185468254528744e-06, + "loss": 0.0005, + "num_tokens": 2162339794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7281727404625757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022772592201337114, + "kl": 0.0130157470703125, + "learning_rate": 4.180622071647683e-06, + "loss": 0.0005, + "num_tokens": 2162905938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7283434326192711, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.28056176737076e-05, + "kl": 0.01318359375, + "learning_rate": 4.1757779544357216e-06, + "loss": 0.0005, + "num_tokens": 2163492738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7285141247759666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000403927883240715, + "kl": 0.0128936767578125, + "learning_rate": 4.170935904612341e-06, + "loss": 0.0005, + "num_tokens": 2164055218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.728684816932662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044177693928071257, + "kl": 0.0136260986328125, + "learning_rate": 4.166095923896301e-06, + "loss": 0.0005, + "num_tokens": 2164621778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7288555090893574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001459531808181713, + "kl": 0.01275634765625, + "learning_rate": 4.161258014005612e-06, + "loss": 0.0005, + "num_tokens": 2165181762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7290262012460528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023078650450019137, + "kl": 0.0124359130859375, + "learning_rate": 4.156422176657567e-06, + "loss": 0.0005, + "num_tokens": 2165749042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7291968934027482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008181740344556259, + "kl": 0.0126495361328125, + "learning_rate": 4.1515884135687026e-06, + "loss": 0.0005, + "num_tokens": 2166318354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7293675855594436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005195266930723124, + "kl": 0.0132598876953125, + "learning_rate": 4.1467567264548395e-06, + "loss": 0.0005, + "num_tokens": 2166885938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.729538277716139, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001884000951560549, + "kl": 0.0130462646484375, + "learning_rate": 4.141927117031045e-06, + "loss": 0.0005, + "num_tokens": 2167457282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7297089698728343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006636605477571755, + "kl": 0.0130462646484375, + "learning_rate": 4.137099587011664e-06, + "loss": 0.0005, + "num_tokens": 2168024338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7298796620295297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010586137738030521, + "kl": 0.0131072998046875, + "learning_rate": 4.132274138110287e-06, + "loss": 0.0005, + "num_tokens": 2168589890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7300503541862251, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022747938146885146, + "kl": 0.0133209228515625, + "learning_rate": 4.127450772039778e-06, + "loss": 0.0005, + "num_tokens": 2169156306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7302210463429205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010251909802460786, + "kl": 0.01251220703125, + "learning_rate": 4.122629490512262e-06, + "loss": 0.0005, + "num_tokens": 2169720066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7303917384996159, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000261272962309698, + "kl": 0.012969970703125, + "learning_rate": 4.117810295239116e-06, + "loss": 0.0005, + "num_tokens": 2170286498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7305624306563113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003627718253731291, + "kl": 0.0125579833984375, + "learning_rate": 4.112993187930977e-06, + "loss": 0.0005, + "num_tokens": 2170849618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7307331228130067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004966666610031406, + "kl": 0.012176513671875, + "learning_rate": 4.1081781702977474e-06, + "loss": 0.0005, + "num_tokens": 2171417506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7309038149697021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022820800811973932, + "kl": 0.0120391845703125, + "learning_rate": 4.1033652440485884e-06, + "loss": 0.0005, + "num_tokens": 2171982642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7310745071263975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000872647491027104, + "kl": 0.013275146484375, + "learning_rate": 4.098554410891912e-06, + "loss": 0.0005, + "num_tokens": 2172546402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.731245199283093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002818156058012041, + "kl": 0.0124664306640625, + "learning_rate": 4.093745672535385e-06, + "loss": 0.0005, + "num_tokens": 2173108018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7314158914397884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013117748528945268, + "kl": 0.01385498046875, + "learning_rate": 4.088939030685941e-06, + "loss": 0.0006, + "num_tokens": 2173672482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7315865835964838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007841193154263313, + "kl": 0.012939453125, + "learning_rate": 4.084134487049768e-06, + "loss": 0.0005, + "num_tokens": 2174237746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7317572757531792, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031760844085115857, + "kl": 0.0128326416015625, + "learning_rate": 4.0793320433323015e-06, + "loss": 0.0005, + "num_tokens": 2174804850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7319279679098746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016932270289560263, + "kl": 0.013153076171875, + "learning_rate": 4.0745317012382325e-06, + "loss": 0.0005, + "num_tokens": 2175368850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.73209866006657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006720534661615652, + "kl": 0.013092041015625, + "learning_rate": 4.0697334624715126e-06, + "loss": 0.0005, + "num_tokens": 2175934274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7322693522232654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010155848289652642, + "kl": 0.012939453125, + "learning_rate": 4.0649373287353454e-06, + "loss": 0.0005, + "num_tokens": 2176499458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7324400443799607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000743849336283778, + "kl": 0.0128936767578125, + "learning_rate": 4.060143301732184e-06, + "loss": 0.0005, + "num_tokens": 2177063634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7326107365366561, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004464063158177108, + "kl": 0.01361083984375, + "learning_rate": 4.05535138316373e-06, + "loss": 0.0005, + "num_tokens": 2177634690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7327814286933515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005778737764824943, + "kl": 0.01300048828125, + "learning_rate": 4.050561574730947e-06, + "loss": 0.0005, + "num_tokens": 2178220722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7329521208500469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017996733999990871, + "kl": 0.0128936767578125, + "learning_rate": 4.04577387813404e-06, + "loss": 0.0005, + "num_tokens": 2178783986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7331228130067423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016833077085444323, + "kl": 0.013671875, + "learning_rate": 4.04098829507248e-06, + "loss": 0.0005, + "num_tokens": 2179358258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7332935051634377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007341231208282056, + "kl": 0.0131072998046875, + "learning_rate": 4.0362048272449605e-06, + "loss": 0.0005, + "num_tokens": 2179926322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7334641973201331, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000256778575722583, + "kl": 0.01324462890625, + "learning_rate": 4.031423476349445e-06, + "loss": 0.0005, + "num_tokens": 2180500802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7336348894768285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016377336081233138, + "kl": 0.012725830078125, + "learning_rate": 4.026644244083144e-06, + "loss": 0.0005, + "num_tokens": 2181065890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7338055816335239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002726889062185813, + "kl": 0.0134429931640625, + "learning_rate": 4.021867132142516e-06, + "loss": 0.0005, + "num_tokens": 2181627026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7339762737902193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003734036932125693, + "kl": 0.0134429931640625, + "learning_rate": 4.017092142223252e-06, + "loss": 0.0005, + "num_tokens": 2182192946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7341469659469148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003790484306197981, + "kl": 0.01300048828125, + "learning_rate": 4.0123192760203065e-06, + "loss": 0.0005, + "num_tokens": 2182753858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7343176581036102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009239769327238827, + "kl": 0.013214111328125, + "learning_rate": 4.007548535227875e-06, + "loss": 0.0005, + "num_tokens": 2183319490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7344883502603056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020875797526255751, + "kl": 0.0133056640625, + "learning_rate": 4.002779921539403e-06, + "loss": 0.0005, + "num_tokens": 2183881922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.734659042417001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003113144668305225, + "kl": 0.01324462890625, + "learning_rate": 3.99801343664757e-06, + "loss": 0.0005, + "num_tokens": 2184448370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7348297345736964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019859876740798134, + "kl": 0.0134429931640625, + "learning_rate": 3.993249082244305e-06, + "loss": 0.0005, + "num_tokens": 2185010978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7350004267303918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004801385180980005, + "kl": 0.012481689453125, + "learning_rate": 3.988486860020785e-06, + "loss": 0.0005, + "num_tokens": 2185574354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7351711188870872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006520943131040612, + "kl": 0.013427734375, + "learning_rate": 3.983726771667429e-06, + "loss": 0.0005, + "num_tokens": 2186138930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7353418110437825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012198585205775846, + "kl": 0.0128631591796875, + "learning_rate": 3.9789688188738954e-06, + "loss": 0.0005, + "num_tokens": 2186702082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7355125032004779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004211681920158584, + "kl": 0.0132598876953125, + "learning_rate": 3.974213003329079e-06, + "loss": 0.0005, + "num_tokens": 2187270034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7356831953571733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004768502819161919, + "kl": 0.012725830078125, + "learning_rate": 3.96945932672113e-06, + "loss": 0.0005, + "num_tokens": 2187844530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7358538875138687, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.975431433022629e-05, + "kl": 0.013397216796875, + "learning_rate": 3.964707790737432e-06, + "loss": 0.0005, + "num_tokens": 2188414866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7360245796705641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003767705932128693, + "kl": 0.012939453125, + "learning_rate": 3.959958397064607e-06, + "loss": 0.0005, + "num_tokens": 2188974930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7361952718272595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004498625629718572, + "kl": 0.012786865234375, + "learning_rate": 3.955211147388516e-06, + "loss": 0.0005, + "num_tokens": 2189536626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7363659639839549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005702240474590039, + "kl": 0.0129547119140625, + "learning_rate": 3.950466043394262e-06, + "loss": 0.0005, + "num_tokens": 2190098866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7365366561406503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00038364847745648754, + "kl": 0.0139007568359375, + "learning_rate": 3.94572308676619e-06, + "loss": 0.0006, + "num_tokens": 2190673714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7367073482973457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006731248282642011, + "kl": 0.01287841796875, + "learning_rate": 3.940982279187876e-06, + "loss": 0.0005, + "num_tokens": 2191240194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7368780404540411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022720109308922845, + "kl": 0.0127410888671875, + "learning_rate": 3.936243622342132e-06, + "loss": 0.0005, + "num_tokens": 2191804050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7370487326107366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007582702869431331, + "kl": 0.0134124755859375, + "learning_rate": 3.931507117911012e-06, + "loss": 0.0005, + "num_tokens": 2192369554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.737219424767432, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006207230892229612, + "kl": 0.0122528076171875, + "learning_rate": 3.926772767575809e-06, + "loss": 0.0005, + "num_tokens": 2192936690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7373901169241274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025840032146032784, + "kl": 0.01324462890625, + "learning_rate": 3.922040573017039e-06, + "loss": 0.0005, + "num_tokens": 2193501842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7375608090808228, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003324002127738467, + "kl": 0.013214111328125, + "learning_rate": 3.917310535914468e-06, + "loss": 0.0005, + "num_tokens": 2194063202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7377315012375182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034031988436032223, + "kl": 0.0130462646484375, + "learning_rate": 3.912582657947081e-06, + "loss": 0.0005, + "num_tokens": 2194636562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7379021933942136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004002073024678717, + "kl": 0.0129241943359375, + "learning_rate": 3.90785694079311e-06, + "loss": 0.0005, + "num_tokens": 2195197698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7380728855509089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005654937223654434, + "kl": 0.0135345458984375, + "learning_rate": 3.9031333861300066e-06, + "loss": 0.0005, + "num_tokens": 2195762754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7382435777076043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010953993583569176, + "kl": 0.0134429931640625, + "learning_rate": 3.898411995634472e-06, + "loss": 0.0005, + "num_tokens": 2196332786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7384142698642997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034196728628301063, + "kl": 0.013214111328125, + "learning_rate": 3.893692770982421e-06, + "loss": 0.0005, + "num_tokens": 2196900018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7385849620209951, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006161399184771567, + "kl": 0.0127410888671875, + "learning_rate": 3.888975713849014e-06, + "loss": 0.0005, + "num_tokens": 2197463986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7387556541776905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016712798061963108, + "kl": 0.01275634765625, + "learning_rate": 3.88426082590863e-06, + "loss": 0.0005, + "num_tokens": 2198030402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7389263463343859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014530487204777614, + "kl": 0.0130615234375, + "learning_rate": 3.879548108834891e-06, + "loss": 0.0005, + "num_tokens": 2198591202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7390970384910813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0058550550744562715, + "kl": 0.013763427734375, + "learning_rate": 3.874837564300634e-06, + "loss": 0.0006, + "num_tokens": 2199152770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7392677306477767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024886267520145603, + "kl": 0.0127716064453125, + "learning_rate": 3.870129193977939e-06, + "loss": 0.0005, + "num_tokens": 2199719266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7394384228044721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014704048978261, + "kl": 0.0130615234375, + "learning_rate": 3.865422999538102e-06, + "loss": 0.0005, + "num_tokens": 2200288258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7396091149611675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010771521758458912, + "kl": 0.01287841796875, + "learning_rate": 3.860718982651658e-06, + "loss": 0.0005, + "num_tokens": 2200856482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.739779807117863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009915140566968267, + "kl": 0.0126953125, + "learning_rate": 3.856017144988356e-06, + "loss": 0.0005, + "num_tokens": 2201418562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7399504992745584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004719521535982117, + "kl": 0.0126495361328125, + "learning_rate": 3.851317488217186e-06, + "loss": 0.0005, + "num_tokens": 2201993698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7401211914312538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003340806829263687, + "kl": 0.0124969482421875, + "learning_rate": 3.846620014006349e-06, + "loss": 0.0005, + "num_tokens": 2202559570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7402918835879492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041845850958699106, + "kl": 0.0133209228515625, + "learning_rate": 3.841924724023282e-06, + "loss": 0.0005, + "num_tokens": 2203123394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7404625757446446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004747471128390719, + "kl": 0.0131378173828125, + "learning_rate": 3.837231619934651e-06, + "loss": 0.0005, + "num_tokens": 2203688354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.74063326790134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006978448760755804, + "kl": 0.0135040283203125, + "learning_rate": 3.832540703406329e-06, + "loss": 0.0005, + "num_tokens": 2204256210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7408039600580353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015980683008573655, + "kl": 0.013427734375, + "learning_rate": 3.8278519761034215e-06, + "loss": 0.0005, + "num_tokens": 2204822898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7409746522147307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037468481855066623, + "kl": 0.0128631591796875, + "learning_rate": 3.823165439690262e-06, + "loss": 0.0005, + "num_tokens": 2205388578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7411453443714261, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031072778125743815, + "kl": 0.0139923095703125, + "learning_rate": 3.818481095830403e-06, + "loss": 0.0006, + "num_tokens": 2205952130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7413160365281215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007573001834360017, + "kl": 0.0137786865234375, + "learning_rate": 3.8137989461866153e-06, + "loss": 0.0006, + "num_tokens": 2206515554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7414867286848169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005283864659664891, + "kl": 0.0127105712890625, + "learning_rate": 3.8091189924208893e-06, + "loss": 0.0005, + "num_tokens": 2207076802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7416574208415123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024163238955975195, + "kl": 0.013214111328125, + "learning_rate": 3.804441236194443e-06, + "loss": 0.0005, + "num_tokens": 2207640210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7418281129982077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015760626728185795, + "kl": 0.0126800537109375, + "learning_rate": 3.799765679167714e-06, + "loss": 0.0005, + "num_tokens": 2208208786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7419988051549031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001681320224277456, + "kl": 0.0139007568359375, + "learning_rate": 3.7950923230003533e-06, + "loss": 0.0006, + "num_tokens": 2208772370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7421694973115985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001591921017383193, + "kl": 0.01385498046875, + "learning_rate": 3.7904211693512305e-06, + "loss": 0.0006, + "num_tokens": 2209337842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7423401894682939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044342167367938875, + "kl": 0.013824462890625, + "learning_rate": 3.785752219878439e-06, + "loss": 0.0006, + "num_tokens": 2209906530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7425108816249893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047720781748537914, + "kl": 0.0126190185546875, + "learning_rate": 3.7810854762392922e-06, + "loss": 0.0005, + "num_tokens": 2210475650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7426815737816848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014401021543766338, + "kl": 0.013458251953125, + "learning_rate": 3.7764209400903116e-06, + "loss": 0.0005, + "num_tokens": 2211039522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7428522659383802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010115091337255689, + "kl": 0.01348876953125, + "learning_rate": 3.7717586130872352e-06, + "loss": 0.0005, + "num_tokens": 2211609954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7430229580950756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001142346742445066, + "kl": 0.01239013671875, + "learning_rate": 3.7670984968850244e-06, + "loss": 0.0005, + "num_tokens": 2212178498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.743193650251771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005632225957711704, + "kl": 0.0136260986328125, + "learning_rate": 3.762440593137856e-06, + "loss": 0.0005, + "num_tokens": 2212753042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7433643424084664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009833577430330715, + "kl": 0.0127410888671875, + "learning_rate": 3.7577849034991144e-06, + "loss": 0.0005, + "num_tokens": 2213316450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7435350345651617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035759568407780693, + "kl": 0.0126800537109375, + "learning_rate": 3.753131429621397e-06, + "loss": 0.0005, + "num_tokens": 2213881362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7437057267218571, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043328183034697964, + "kl": 0.0122222900390625, + "learning_rate": 3.7484801731565234e-06, + "loss": 0.0005, + "num_tokens": 2214467970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7438764188785525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006538406092690286, + "kl": 0.01275634765625, + "learning_rate": 3.7438311357555212e-06, + "loss": 0.0005, + "num_tokens": 2215036418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7440471110352479, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006646390649729304, + "kl": 0.0130462646484375, + "learning_rate": 3.739184319068638e-06, + "loss": 0.0005, + "num_tokens": 2215597826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7442178031919433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005501134644575847, + "kl": 0.0128021240234375, + "learning_rate": 3.734539724745313e-06, + "loss": 0.0005, + "num_tokens": 2216161794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7443884953486387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001362885607662609, + "kl": 0.0131988525390625, + "learning_rate": 3.7298973544342145e-06, + "loss": 0.0005, + "num_tokens": 2216737874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7445591875053341, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005434129856798196, + "kl": 0.013336181640625, + "learning_rate": 3.7252572097832173e-06, + "loss": 0.0005, + "num_tokens": 2217311122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7447298796620295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008040284504436874, + "kl": 0.0138397216796875, + "learning_rate": 3.7206192924394103e-06, + "loss": 0.0006, + "num_tokens": 2217875538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7449005718187249, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00069199707634641, + "kl": 0.0127105712890625, + "learning_rate": 3.7159836040490804e-06, + "loss": 0.0005, + "num_tokens": 2218443090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7450712639754203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015844386137910536, + "kl": 0.0136871337890625, + "learning_rate": 3.7113501462577284e-06, + "loss": 0.0005, + "num_tokens": 2219008594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7452419561321157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015250085513965346, + "kl": 0.01251220703125, + "learning_rate": 3.7067189207100673e-06, + "loss": 0.0005, + "num_tokens": 2219570162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7454126482888112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005222283430082603, + "kl": 0.013275146484375, + "learning_rate": 3.702089929050018e-06, + "loss": 0.0005, + "num_tokens": 2220136994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7455833404455066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039798812337869, + "kl": 0.01348876953125, + "learning_rate": 3.6974631729207024e-06, + "loss": 0.0005, + "num_tokens": 2220709202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.745754032602202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016776318834758102, + "kl": 0.0131072998046875, + "learning_rate": 3.692838653964449e-06, + "loss": 0.0005, + "num_tokens": 2221270834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7459247247588974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007813223940527523, + "kl": 0.0124359130859375, + "learning_rate": 3.6882163738227973e-06, + "loss": 0.0005, + "num_tokens": 2221836354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7460954169155928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048529636561477096, + "kl": 0.012176513671875, + "learning_rate": 3.6835963341364945e-06, + "loss": 0.0005, + "num_tokens": 2222404338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7462661090722881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001169088323080067, + "kl": 0.012786865234375, + "learning_rate": 3.678978536545483e-06, + "loss": 0.0005, + "num_tokens": 2222971618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7464368012289835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001400594633173844, + "kl": 0.0131072998046875, + "learning_rate": 3.6743629826889136e-06, + "loss": 0.0005, + "num_tokens": 2223538706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7466074933856789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005361256221112275, + "kl": 0.012664794921875, + "learning_rate": 3.669749674205142e-06, + "loss": 0.0005, + "num_tokens": 2224102818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7467781855423743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026445586101873377, + "kl": 0.0134124755859375, + "learning_rate": 3.66513861273173e-06, + "loss": 0.0005, + "num_tokens": 2224666834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7469488776990697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045909495593118334, + "kl": 0.012451171875, + "learning_rate": 3.6605297999054356e-06, + "loss": 0.0005, + "num_tokens": 2225233394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7471195698557651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003954036325759662, + "kl": 0.0130615234375, + "learning_rate": 3.655923237362218e-06, + "loss": 0.0005, + "num_tokens": 2225796562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7472902620124605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005933261437173069, + "kl": 0.0125274658203125, + "learning_rate": 3.6513189267372417e-06, + "loss": 0.0005, + "num_tokens": 2226362466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7474609541691559, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032100345637326713, + "kl": 0.012359619140625, + "learning_rate": 3.646716869664876e-06, + "loss": 0.0005, + "num_tokens": 2226926770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7476316463258513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001565652673253134, + "kl": 0.0127105712890625, + "learning_rate": 3.6421170677786787e-06, + "loss": 0.0005, + "num_tokens": 2227490738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7478023384825467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011303879726604405, + "kl": 0.012451171875, + "learning_rate": 3.637519522711418e-06, + "loss": 0.0005, + "num_tokens": 2228055938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7479730306392421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020634790415100112, + "kl": 0.0124359130859375, + "learning_rate": 3.632924236095052e-06, + "loss": 0.0005, + "num_tokens": 2228625170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7481437227959375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005558213480382592, + "kl": 0.01275634765625, + "learning_rate": 3.628331209560747e-06, + "loss": 0.0005, + "num_tokens": 2229191682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.748314414952633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004096426822205295, + "kl": 0.01275634765625, + "learning_rate": 3.623740444738855e-06, + "loss": 0.0005, + "num_tokens": 2229759234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7484851071093284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006779467455977066, + "kl": 0.0131683349609375, + "learning_rate": 3.6191519432589395e-06, + "loss": 0.0005, + "num_tokens": 2230322354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7486557992660238, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013806375882098072, + "kl": 0.012725830078125, + "learning_rate": 3.614565706749744e-06, + "loss": 0.0005, + "num_tokens": 2230880722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7488264914227192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009624157336361463, + "kl": 0.013031005859375, + "learning_rate": 3.6099817368392267e-06, + "loss": 0.0005, + "num_tokens": 2231446482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7489971835794145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021224079493095476, + "kl": 0.0133209228515625, + "learning_rate": 3.6054000351545217e-06, + "loss": 0.0005, + "num_tokens": 2232012370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7491678757361099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011961750399002696, + "kl": 0.013702392578125, + "learning_rate": 3.600820603321976e-06, + "loss": 0.0005, + "num_tokens": 2232576834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7493385678928053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029881320419096984, + "kl": 0.0135040283203125, + "learning_rate": 3.5962434429671158e-06, + "loss": 0.0005, + "num_tokens": 2233149490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7495092600495007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007173415657464368, + "kl": 0.0132293701171875, + "learning_rate": 3.5916685557146745e-06, + "loss": 0.0005, + "num_tokens": 2233718946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7496799522061961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007753617530062657, + "kl": 0.0130767822265625, + "learning_rate": 3.5870959431885653e-06, + "loss": 0.0005, + "num_tokens": 2234284866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7498506443628915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004754063037386536, + "kl": 0.01318359375, + "learning_rate": 3.582525607011906e-06, + "loss": 0.0005, + "num_tokens": 2234855954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7500213365195869, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000663388860842221, + "kl": 0.0137939453125, + "learning_rate": 3.577957548806997e-06, + "loss": 0.0006, + "num_tokens": 2235420642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7501920286762823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012566136284778776, + "kl": 0.012939453125, + "learning_rate": 3.573391770195338e-06, + "loss": 0.0005, + "num_tokens": 2235985266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7503627208329777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019269159562880698, + "kl": 0.0129241943359375, + "learning_rate": 3.568828272797611e-06, + "loss": 0.0005, + "num_tokens": 2236549362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7505334129896731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006483540494063024, + "kl": 0.0128631591796875, + "learning_rate": 3.564267058233697e-06, + "loss": 0.0005, + "num_tokens": 2237112834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7507041051463685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016844283001346888, + "kl": 0.0130157470703125, + "learning_rate": 3.5597081281226587e-06, + "loss": 0.0005, + "num_tokens": 2237675714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.750874797303064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026188866952607016, + "kl": 0.0131072998046875, + "learning_rate": 3.555151484082756e-06, + "loss": 0.0005, + "num_tokens": 2238238530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7510454894597594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002128592072172454, + "kl": 0.01361083984375, + "learning_rate": 3.5505971277314276e-06, + "loss": 0.0005, + "num_tokens": 2238801650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7512161816164548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005662116724194143, + "kl": 0.0126953125, + "learning_rate": 3.54604506068531e-06, + "loss": 0.0005, + "num_tokens": 2239368402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7513868737731502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004793622512400073, + "kl": 0.0124053955078125, + "learning_rate": 3.5414952845602235e-06, + "loss": 0.0005, + "num_tokens": 2239932626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7515575659298456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002683880710409418, + "kl": 0.0135498046875, + "learning_rate": 3.5369478009711734e-06, + "loss": 0.0005, + "num_tokens": 2240493858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7517282580865409, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019828413958900647, + "kl": 0.0137176513671875, + "learning_rate": 3.532402611532347e-06, + "loss": 0.0005, + "num_tokens": 2241055762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7518989502432363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009636020246121981, + "kl": 0.0123748779296875, + "learning_rate": 3.527859717857127e-06, + "loss": 0.0005, + "num_tokens": 2241616370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7520696423999317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007021374137993342, + "kl": 0.0127410888671875, + "learning_rate": 3.52331912155808e-06, + "loss": 0.0005, + "num_tokens": 2242181794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7522403345566271, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046469714466905665, + "kl": 0.0124359130859375, + "learning_rate": 3.518780824246949e-06, + "loss": 0.0005, + "num_tokens": 2242755026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7524110267133225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005763149118551054, + "kl": 0.0126495361328125, + "learning_rate": 3.514244827534664e-06, + "loss": 0.0005, + "num_tokens": 2243317586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7525817188700179, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002723826956229933, + "kl": 0.012969970703125, + "learning_rate": 3.509711133031343e-06, + "loss": 0.0005, + "num_tokens": 2243883234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7527524110267133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004386955967700363, + "kl": 0.0133514404296875, + "learning_rate": 3.5051797423462873e-06, + "loss": 0.0005, + "num_tokens": 2244445778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7529231031834087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025237342678032995, + "kl": 0.0140228271484375, + "learning_rate": 3.500650657087974e-06, + "loss": 0.0006, + "num_tokens": 2245015442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7530937953401041, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007506063666805526, + "kl": 0.012786865234375, + "learning_rate": 3.4961238788640595e-06, + "loss": 0.0005, + "num_tokens": 2245578402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7532644874967995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008508823703889069, + "kl": 0.0142669677734375, + "learning_rate": 3.4915994092813933e-06, + "loss": 0.0006, + "num_tokens": 2246146866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7534351796534949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024535495331750554, + "kl": 0.0126190185546875, + "learning_rate": 3.4870772499459994e-06, + "loss": 0.0005, + "num_tokens": 2246710514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7536058718101903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008913340402549666, + "kl": 0.0125732421875, + "learning_rate": 3.482557402463078e-06, + "loss": 0.0005, + "num_tokens": 2247277570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7537765639668857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034750852405496897, + "kl": 0.01263427734375, + "learning_rate": 3.4780398684370107e-06, + "loss": 0.0005, + "num_tokens": 2247848034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7539472561235812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008875572390234239, + "kl": 0.0134735107421875, + "learning_rate": 3.4735246494713605e-06, + "loss": 0.0005, + "num_tokens": 2248416098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7541179482802766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009357908570426686, + "kl": 0.013214111328125, + "learning_rate": 3.4690117471688667e-06, + "loss": 0.0005, + "num_tokens": 2248984242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.754288640436972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009269802351811439, + "kl": 0.0131683349609375, + "learning_rate": 3.464501163131455e-06, + "loss": 0.0005, + "num_tokens": 2249559394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7544593325936674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014008109375568236, + "kl": 0.0135650634765625, + "learning_rate": 3.459992898960206e-06, + "loss": 0.0005, + "num_tokens": 2250122146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7546300247503627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015694933337204365, + "kl": 0.0126495361328125, + "learning_rate": 3.4554869562553973e-06, + "loss": 0.0005, + "num_tokens": 2250686242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7548007169070581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007164137637155303, + "kl": 0.014007568359375, + "learning_rate": 3.4509833366164746e-06, + "loss": 0.0006, + "num_tokens": 2251247954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7549714090637535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032258661237940476, + "kl": 0.0133514404296875, + "learning_rate": 3.4464820416420653e-06, + "loss": 0.0005, + "num_tokens": 2251819234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7551421012204489, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003189907669643995, + "kl": 0.01385498046875, + "learning_rate": 3.4419830729299632e-06, + "loss": 0.0006, + "num_tokens": 2252381874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7553127933771443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005511044874997939, + "kl": 0.0132904052734375, + "learning_rate": 3.4374864320771364e-06, + "loss": 0.0005, + "num_tokens": 2252944674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7554834855338397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021189332522015487, + "kl": 0.012939453125, + "learning_rate": 3.432992120679731e-06, + "loss": 0.0005, + "num_tokens": 2253509650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7556541776905351, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016570971861422316, + "kl": 0.0132293701171875, + "learning_rate": 3.4285001403330732e-06, + "loss": 0.0005, + "num_tokens": 2254081266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7558248698472305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001237148917829743, + "kl": 0.0128173828125, + "learning_rate": 3.4240104926316466e-06, + "loss": 0.0005, + "num_tokens": 2254642370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7559955620039259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009122283404362115, + "kl": 0.012786865234375, + "learning_rate": 3.419523179169113e-06, + "loss": 0.0005, + "num_tokens": 2255204786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7561662541606213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007012299514022839, + "kl": 0.012725830078125, + "learning_rate": 3.415038201538309e-06, + "loss": 0.0005, + "num_tokens": 2255766434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7563369463173167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028744786862638943, + "kl": 0.0133514404296875, + "learning_rate": 3.4105555613312425e-06, + "loss": 0.0005, + "num_tokens": 2256329218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7565076384740121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005632675499844061, + "kl": 0.01239013671875, + "learning_rate": 3.406075260139088e-06, + "loss": 0.0005, + "num_tokens": 2256893250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7566783306307076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004083650625142756, + "kl": 0.0125885009765625, + "learning_rate": 3.401597299552184e-06, + "loss": 0.0005, + "num_tokens": 2257458194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.756849022787403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003742077927072301, + "kl": 0.0137481689453125, + "learning_rate": 3.3971216811600518e-06, + "loss": 0.0005, + "num_tokens": 2258019666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7570197149440984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003372269125509434, + "kl": 0.012451171875, + "learning_rate": 3.3926484065513744e-06, + "loss": 0.0005, + "num_tokens": 2258580450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7571904071007938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003064080272014982, + "kl": 0.01324462890625, + "learning_rate": 3.388177477314002e-06, + "loss": 0.0005, + "num_tokens": 2259145586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7573610992574891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008213430439401649, + "kl": 0.0126495361328125, + "learning_rate": 3.3837088950349482e-06, + "loss": 0.0005, + "num_tokens": 2259713298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7575317914141845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016515593324360129, + "kl": 0.012786865234375, + "learning_rate": 3.3792426613004026e-06, + "loss": 0.0005, + "num_tokens": 2260275058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7577024835708799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004707203191261913, + "kl": 0.012725830078125, + "learning_rate": 3.3747787776957197e-06, + "loss": 0.0005, + "num_tokens": 2260842130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7578731757275753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001655702908669556, + "kl": 0.012847900390625, + "learning_rate": 3.3703172458054114e-06, + "loss": 0.0005, + "num_tokens": 2261410418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7580438678842707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003284789383334912, + "kl": 0.013519287109375, + "learning_rate": 3.3658580672131646e-06, + "loss": 0.0005, + "num_tokens": 2261977250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7582145600409661, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006098118681360218, + "kl": 0.013092041015625, + "learning_rate": 3.3614012435018226e-06, + "loss": 0.0005, + "num_tokens": 2262541090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7583852521976615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006039226037966884, + "kl": 0.01287841796875, + "learning_rate": 3.3569467762534037e-06, + "loss": 0.0005, + "num_tokens": 2263102930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7585559443543569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009074999318718343, + "kl": 0.012786865234375, + "learning_rate": 3.3524946670490753e-06, + "loss": 0.0005, + "num_tokens": 2263672610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7587266365110523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008876278603274792, + "kl": 0.0126190185546875, + "learning_rate": 3.348044917469182e-06, + "loss": 0.0005, + "num_tokens": 2264234210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7588973286677477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003117160848337806, + "kl": 0.0125732421875, + "learning_rate": 3.3435975290932176e-06, + "loss": 0.0005, + "num_tokens": 2264812834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7590680208244431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039908787425526005, + "kl": 0.0133209228515625, + "learning_rate": 3.339152503499852e-06, + "loss": 0.0005, + "num_tokens": 2265375810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7592387129811385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005804311423383722, + "kl": 0.01275634765625, + "learning_rate": 3.3347098422669e-06, + "loss": 0.0005, + "num_tokens": 2265938146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.759409405137834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007916039181097472, + "kl": 0.0126953125, + "learning_rate": 3.3302695469713553e-06, + "loss": 0.0005, + "num_tokens": 2266502962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7595800972945294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033000639161663995, + "kl": 0.0133819580078125, + "learning_rate": 3.3258316191893547e-06, + "loss": 0.0005, + "num_tokens": 2267065970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7597507894512248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004703691740933617, + "kl": 0.0127105712890625, + "learning_rate": 3.321396060496209e-06, + "loss": 0.0005, + "num_tokens": 2267630610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7599214816079202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001268219513481312, + "kl": 0.01312255859375, + "learning_rate": 3.316962872466375e-06, + "loss": 0.0005, + "num_tokens": 2268193922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7600921737646155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013982743652942884, + "kl": 0.0128326416015625, + "learning_rate": 3.3125320566734796e-06, + "loss": 0.0005, + "num_tokens": 2268767922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7602628659213109, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014332562208988313, + "kl": 0.012420654296875, + "learning_rate": 3.3081036146903013e-06, + "loss": 0.0005, + "num_tokens": 2269332162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7604335580780063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013086184689845855, + "kl": 0.0125274658203125, + "learning_rate": 3.303677548088773e-06, + "loss": 0.0005, + "num_tokens": 2269892850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7606042502347017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0045836388162440135, + "kl": 0.0133819580078125, + "learning_rate": 3.2992538584399913e-06, + "loss": 0.0005, + "num_tokens": 2270459138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7607749423913971, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005457646198619825, + "kl": 0.0124664306640625, + "learning_rate": 3.2948325473142105e-06, + "loss": 0.0005, + "num_tokens": 2271027314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7609456345480925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007150281421253863, + "kl": 0.013458251953125, + "learning_rate": 3.290413616280832e-06, + "loss": 0.0005, + "num_tokens": 2271589522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7611163267047879, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001470869307000098, + "kl": 0.0133514404296875, + "learning_rate": 3.285997066908415e-06, + "loss": 0.0005, + "num_tokens": 2272156754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7612870188614833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025704337910038757, + "kl": 0.012725830078125, + "learning_rate": 3.2815829007646783e-06, + "loss": 0.0005, + "num_tokens": 2272719554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7614577110181787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046627354710087005, + "kl": 0.01348876953125, + "learning_rate": 3.27717111941649e-06, + "loss": 0.0005, + "num_tokens": 2273281138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7616284031748741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004783712293559245, + "kl": 0.0136566162109375, + "learning_rate": 3.2727617244298805e-06, + "loss": 0.0005, + "num_tokens": 2273846402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7617990953315695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034974215567860597, + "kl": 0.012725830078125, + "learning_rate": 3.2683547173700135e-06, + "loss": 0.0005, + "num_tokens": 2274417682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7619697874882649, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004468344382326279, + "kl": 0.01318359375, + "learning_rate": 3.2639500998012232e-06, + "loss": 0.0005, + "num_tokens": 2274979666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7621404796449603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005744865808713352, + "kl": 0.0137176513671875, + "learning_rate": 3.2595478732869902e-06, + "loss": 0.0005, + "num_tokens": 2275551890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7623111718016558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005193014224243328, + "kl": 0.01397705078125, + "learning_rate": 3.255148039389948e-06, + "loss": 0.0006, + "num_tokens": 2276124210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7624818639583512, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005735432371095622, + "kl": 0.0130157470703125, + "learning_rate": 3.250750599671878e-06, + "loss": 0.0005, + "num_tokens": 2276690402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7626525561150466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044963387887424146, + "kl": 0.0138092041015625, + "learning_rate": 3.2463555556937075e-06, + "loss": 0.0006, + "num_tokens": 2277260274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7628232482717419, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009593269007781733, + "kl": 0.012725830078125, + "learning_rate": 3.2419629090155215e-06, + "loss": 0.0005, + "num_tokens": 2277824978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7629939404284373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003889870576586012, + "kl": 0.01373291015625, + "learning_rate": 3.2375726611965552e-06, + "loss": 0.0005, + "num_tokens": 2278392834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7631646325851327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037811059500906326, + "kl": 0.013214111328125, + "learning_rate": 3.233184813795185e-06, + "loss": 0.0005, + "num_tokens": 2278970066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7633353247418281, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000521953160732711, + "kl": 0.01287841796875, + "learning_rate": 3.2287993683689347e-06, + "loss": 0.0005, + "num_tokens": 2279532722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7635060168985235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007891684051365643, + "kl": 0.0130615234375, + "learning_rate": 3.224416326474482e-06, + "loss": 0.0005, + "num_tokens": 2280098754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7636767090552189, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006176286198077784, + "kl": 0.0131988525390625, + "learning_rate": 3.220035689667652e-06, + "loss": 0.0005, + "num_tokens": 2280667506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7638474012119143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008993444626051822, + "kl": 0.01275634765625, + "learning_rate": 3.2156574595034107e-06, + "loss": 0.0005, + "num_tokens": 2281237202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7640180933686097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007989648149295808, + "kl": 0.0136260986328125, + "learning_rate": 3.211281637535866e-06, + "loss": 0.0005, + "num_tokens": 2281804770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7641887855253051, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006255045771988948, + "kl": 0.012359619140625, + "learning_rate": 3.2069082253182813e-06, + "loss": 0.0005, + "num_tokens": 2282366370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7643594776820005, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005474418758946191, + "kl": 0.0128021240234375, + "learning_rate": 3.2025372244030638e-06, + "loss": 0.0005, + "num_tokens": 2282932946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7645301698386959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001343234883343116, + "kl": 0.0130462646484375, + "learning_rate": 3.1981686363417573e-06, + "loss": 0.0005, + "num_tokens": 2283497522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7647008619953913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004234818882353139, + "kl": 0.0124053955078125, + "learning_rate": 3.1938024626850507e-06, + "loss": 0.0005, + "num_tokens": 2284066898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7648715541520867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006482204121908506, + "kl": 0.013336181640625, + "learning_rate": 3.189438704982779e-06, + "loss": 0.0005, + "num_tokens": 2284632290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7650422463087821, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00013454407047155335, + "kl": 0.0122528076171875, + "learning_rate": 3.185077364783923e-06, + "loss": 0.0005, + "num_tokens": 2285194146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7652129384654776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005604985690756352, + "kl": 0.0133819580078125, + "learning_rate": 3.1807184436365944e-06, + "loss": 0.0005, + "num_tokens": 2285764194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.765383630622173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0061845615560838126, + "kl": 0.0143585205078125, + "learning_rate": 3.176361943088061e-06, + "loss": 0.0006, + "num_tokens": 2286331202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7655543227788683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008123775616548438, + "kl": 0.012969970703125, + "learning_rate": 3.172007864684714e-06, + "loss": 0.0005, + "num_tokens": 2286897714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7657250149355637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003159005037861419, + "kl": 0.0128021240234375, + "learning_rate": 3.167656209972103e-06, + "loss": 0.0005, + "num_tokens": 2287463954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7658957070922591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024738610179367326, + "kl": 0.0127410888671875, + "learning_rate": 3.1633069804949004e-06, + "loss": 0.0005, + "num_tokens": 2288026994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7660663992489545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010203139783924151, + "kl": 0.021209716796875, + "learning_rate": 3.1589601777969314e-06, + "loss": 0.0008, + "num_tokens": 2288627810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7662370914056499, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0040222123168442885, + "kl": 0.0134735107421875, + "learning_rate": 3.1546158034211504e-06, + "loss": 0.0005, + "num_tokens": 2289191650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7664077835623453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012401043815789663, + "kl": 0.0132598876953125, + "learning_rate": 3.150273858909657e-06, + "loss": 0.0005, + "num_tokens": 2289762770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7665784757190407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000902500927917116, + "kl": 0.0131378173828125, + "learning_rate": 3.1459343458036807e-06, + "loss": 0.0005, + "num_tokens": 2290340850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7667491678757361, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025173424606950673, + "kl": 0.0134735107421875, + "learning_rate": 3.1415972656435966e-06, + "loss": 0.0005, + "num_tokens": 2290908290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7669198600324315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003677713382701061, + "kl": 0.0129852294921875, + "learning_rate": 3.1372626199689062e-06, + "loss": 0.0005, + "num_tokens": 2291472466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7670905521891269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006461601394889089, + "kl": 0.0125885009765625, + "learning_rate": 3.132930410318258e-06, + "loss": 0.0005, + "num_tokens": 2292040082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7672612443458223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031523051749010996, + "kl": 0.0128173828125, + "learning_rate": 3.128600638229423e-06, + "loss": 0.0005, + "num_tokens": 2292612066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7674319365025177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004957297934162137, + "kl": 0.0137939453125, + "learning_rate": 3.1242733052393227e-06, + "loss": 0.0006, + "num_tokens": 2293178082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7676026286592131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005896224419925584, + "kl": 0.0135955810546875, + "learning_rate": 3.1199484128839965e-06, + "loss": 0.0005, + "num_tokens": 2293744722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7677733208159085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006260592542375663, + "kl": 0.0135345458984375, + "learning_rate": 3.115625962698631e-06, + "loss": 0.0005, + "num_tokens": 2294309730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.767944012972604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009769061810855316, + "kl": 0.0135345458984375, + "learning_rate": 3.111305956217533e-06, + "loss": 0.0005, + "num_tokens": 2294880290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7681147051292994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014339459874668692, + "kl": 0.0130615234375, + "learning_rate": 3.1069883949741575e-06, + "loss": 0.0005, + "num_tokens": 2295440434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7682853972859947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001736646752960494, + "kl": 0.0141754150390625, + "learning_rate": 3.102673280501074e-06, + "loss": 0.0006, + "num_tokens": 2296009922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7684560894426901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007131692589094585, + "kl": 0.0126495361328125, + "learning_rate": 3.0983606143300006e-06, + "loss": 0.0005, + "num_tokens": 2296571794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7686267815993855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012659876567896028, + "kl": 0.0128021240234375, + "learning_rate": 3.0940503979917715e-06, + "loss": 0.0005, + "num_tokens": 2297136546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7687974737560809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011286866715896804, + "kl": 0.0135498046875, + "learning_rate": 3.0897426330163594e-06, + "loss": 0.0005, + "num_tokens": 2297706450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7689681659127763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005116420207632428, + "kl": 0.01373291015625, + "learning_rate": 3.0854373209328704e-06, + "loss": 0.0005, + "num_tokens": 2298281122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7691388580694717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006750791811431124, + "kl": 0.0135345458984375, + "learning_rate": 3.0811344632695316e-06, + "loss": 0.0005, + "num_tokens": 2298849298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7693095502261671, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020131979077409702, + "kl": 0.0133819580078125, + "learning_rate": 3.0768340615536975e-06, + "loss": 0.0005, + "num_tokens": 2299415010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7694802423828625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047969623799682304, + "kl": 0.013916015625, + "learning_rate": 3.0725361173118604e-06, + "loss": 0.0006, + "num_tokens": 2299981298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7696509345395579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002899667400874289, + "kl": 0.0143585205078125, + "learning_rate": 3.0682406320696377e-06, + "loss": 0.0006, + "num_tokens": 2300541506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7698216266962533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005976596707770892, + "kl": 0.0141448974609375, + "learning_rate": 3.063947607351768e-06, + "loss": 0.0006, + "num_tokens": 2301103730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7699923188529487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007240187642982537, + "kl": 0.01318359375, + "learning_rate": 3.059657044682117e-06, + "loss": 0.0005, + "num_tokens": 2301669826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7701630110096441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002404279638252621, + "kl": 0.012969970703125, + "learning_rate": 3.0553689455836844e-06, + "loss": 0.0005, + "num_tokens": 2302238818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7703337031663395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014198352343532295, + "kl": 0.013336181640625, + "learning_rate": 3.051083311578592e-06, + "loss": 0.0005, + "num_tokens": 2302804530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7705043953230349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017231195423991075, + "kl": 0.0128173828125, + "learning_rate": 3.0468001441880823e-06, + "loss": 0.0005, + "num_tokens": 2303368546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7706750874797303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001149750678327097, + "kl": 0.0130462646484375, + "learning_rate": 3.042519444932522e-06, + "loss": 0.0005, + "num_tokens": 2303932162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7708457796364258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009053326856488343, + "kl": 0.01385498046875, + "learning_rate": 3.038241215331409e-06, + "loss": 0.0006, + "num_tokens": 2304496882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7710164717931212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014319015212375144, + "kl": 0.01348876953125, + "learning_rate": 3.0339654569033616e-06, + "loss": 0.0005, + "num_tokens": 2305055650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7711871639498165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009579021109496572, + "kl": 0.0130615234375, + "learning_rate": 3.029692171166119e-06, + "loss": 0.0005, + "num_tokens": 2305619442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7713578561065119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001005416352961875, + "kl": 0.012969970703125, + "learning_rate": 3.0254213596365367e-06, + "loss": 0.0005, + "num_tokens": 2306184322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7715285482632073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005614261553495489, + "kl": 0.01373291015625, + "learning_rate": 3.021153023830605e-06, + "loss": 0.0005, + "num_tokens": 2306748482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7716992404199027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007356961909608719, + "kl": 0.013641357421875, + "learning_rate": 3.016887165263428e-06, + "loss": 0.0005, + "num_tokens": 2307309922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7718699325765981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002273814521597593, + "kl": 0.012664794921875, + "learning_rate": 3.012623785449238e-06, + "loss": 0.0005, + "num_tokens": 2307870850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7720406247332935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004364643533328351, + "kl": 0.01361083984375, + "learning_rate": 3.0083628859013682e-06, + "loss": 0.0005, + "num_tokens": 2308432338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7722113168899889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020279200121675945, + "kl": 0.0141143798828125, + "learning_rate": 3.0041044681322895e-06, + "loss": 0.0006, + "num_tokens": 2308996626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7723820090466843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002448419535547807, + "kl": 0.0133209228515625, + "learning_rate": 2.999848533653588e-06, + "loss": 0.0005, + "num_tokens": 2309568706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7725527012033797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002316374895146775, + "kl": 0.0134735107421875, + "learning_rate": 2.9955950839759685e-06, + "loss": 0.0005, + "num_tokens": 2310127570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7727233933600751, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008115305166013066, + "kl": 0.0137176513671875, + "learning_rate": 2.991344120609251e-06, + "loss": 0.0005, + "num_tokens": 2310690210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7728940855167705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005009557780403858, + "kl": 0.013092041015625, + "learning_rate": 2.987095645062368e-06, + "loss": 0.0005, + "num_tokens": 2311255826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7730647776734659, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011931913939472398, + "kl": 0.013519287109375, + "learning_rate": 2.982849658843381e-06, + "loss": 0.0005, + "num_tokens": 2311818626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7732354698301613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005970443942621956, + "kl": 0.0132293701171875, + "learning_rate": 2.978606163459462e-06, + "loss": 0.0005, + "num_tokens": 2312380338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7734061619868567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009523380597927167, + "kl": 0.013946533203125, + "learning_rate": 2.9743651604168987e-06, + "loss": 0.0006, + "num_tokens": 2312944930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7735768541435522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006288750281042136, + "kl": 0.0133056640625, + "learning_rate": 2.970126651221089e-06, + "loss": 0.0005, + "num_tokens": 2313507266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7737475463002476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001295884349748392, + "kl": 0.0130157470703125, + "learning_rate": 2.965890637376554e-06, + "loss": 0.0005, + "num_tokens": 2314074386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7739182384569429, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007008048023457711, + "kl": 0.0138702392578125, + "learning_rate": 2.961657120386929e-06, + "loss": 0.0006, + "num_tokens": 2314638962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7740889306136383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001744385546401767, + "kl": 0.0136871337890625, + "learning_rate": 2.957426101754958e-06, + "loss": 0.0005, + "num_tokens": 2315203538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7742596227703337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023683781259319423, + "kl": 0.0127410888671875, + "learning_rate": 2.9531975829824946e-06, + "loss": 0.0005, + "num_tokens": 2315768802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7744303149270291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044505736880123776, + "kl": 0.0136871337890625, + "learning_rate": 2.948971565570514e-06, + "loss": 0.0005, + "num_tokens": 2316337938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7746010070837245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006127248882456264, + "kl": 0.0133819580078125, + "learning_rate": 2.944748051019104e-06, + "loss": 0.0005, + "num_tokens": 2316899538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7747716992404199, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001241692719031133, + "kl": 0.01324462890625, + "learning_rate": 2.9405270408274545e-06, + "loss": 0.0005, + "num_tokens": 2317460114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7749423913971153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012794736141207709, + "kl": 0.0135650634765625, + "learning_rate": 2.9363085364938717e-06, + "loss": 0.0005, + "num_tokens": 2318027730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7751130835538107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010075512089917215, + "kl": 0.01348876953125, + "learning_rate": 2.9320925395157728e-06, + "loss": 0.0005, + "num_tokens": 2318591458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7752837757105061, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019527148140136683, + "kl": 0.01263427734375, + "learning_rate": 2.9278790513896884e-06, + "loss": 0.0005, + "num_tokens": 2319157778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7754544678672015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012929311725642548, + "kl": 0.0133514404296875, + "learning_rate": 2.923668073611252e-06, + "loss": 0.0005, + "num_tokens": 2319719922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7756251600238969, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008096394814034564, + "kl": 0.013336181640625, + "learning_rate": 2.9194596076752067e-06, + "loss": 0.0005, + "num_tokens": 2320286290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7757958521805923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003750996153856516, + "kl": 0.0134429931640625, + "learning_rate": 2.9152536550754062e-06, + "loss": 0.0005, + "num_tokens": 2320849618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7759665443372877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002526525014958194, + "kl": 0.01318359375, + "learning_rate": 2.911050217304817e-06, + "loss": 0.0005, + "num_tokens": 2321412802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7761372364939831, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001635855072484907, + "kl": 0.0126953125, + "learning_rate": 2.9068492958555016e-06, + "loss": 0.0005, + "num_tokens": 2321978546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7763079286506785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000573532963533225, + "kl": 0.0127105712890625, + "learning_rate": 2.9026508922186416e-06, + "loss": 0.0005, + "num_tokens": 2322546354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.776478620807374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003016247349231304, + "kl": 0.0126190185546875, + "learning_rate": 2.898455007884512e-06, + "loss": 0.0005, + "num_tokens": 2323114386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7766493129640692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031105097337374987, + "kl": 0.01318359375, + "learning_rate": 2.8942616443425064e-06, + "loss": 0.0005, + "num_tokens": 2323674930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7768200051207647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012162740930319138, + "kl": 0.0132293701171875, + "learning_rate": 2.8900708030811108e-06, + "loss": 0.0005, + "num_tokens": 2324234978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7769906972774601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000986690536392959, + "kl": 0.013092041015625, + "learning_rate": 2.8858824855879307e-06, + "loss": 0.0005, + "num_tokens": 2324803010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7771613894341555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006038817324313161, + "kl": 0.013580322265625, + "learning_rate": 2.8816966933496595e-06, + "loss": 0.0005, + "num_tokens": 2325362498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7773320815908509, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006542293840549377, + "kl": 0.01287841796875, + "learning_rate": 2.87751342785211e-06, + "loss": 0.0005, + "num_tokens": 2325928050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7775027737475463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004404271733105794, + "kl": 0.012847900390625, + "learning_rate": 2.8733326905801827e-06, + "loss": 0.0005, + "num_tokens": 2326493346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7776734659042417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003971100862103623, + "kl": 0.0133819580078125, + "learning_rate": 2.8691544830178954e-06, + "loss": 0.0005, + "num_tokens": 2327060306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7778441580609371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043966940166567073, + "kl": 0.0125732421875, + "learning_rate": 2.864978806648355e-06, + "loss": 0.0005, + "num_tokens": 2327621618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7780148502176325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013534690178978127, + "kl": 0.0132904052734375, + "learning_rate": 2.860805662953783e-06, + "loss": 0.0005, + "num_tokens": 2328192354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7781855423743279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005360532369293098, + "kl": 0.0133209228515625, + "learning_rate": 2.8566350534154865e-06, + "loss": 0.0005, + "num_tokens": 2328755650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7783562345310233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041681968009427996, + "kl": 0.01361083984375, + "learning_rate": 2.852466979513888e-06, + "loss": 0.0005, + "num_tokens": 2329327442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7785269266877187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004309145445746879, + "kl": 0.012939453125, + "learning_rate": 2.8483014427284973e-06, + "loss": 0.0005, + "num_tokens": 2329891042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7786976188444141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008540988681468803, + "kl": 0.0130767822265625, + "learning_rate": 2.844138444537937e-06, + "loss": 0.0005, + "num_tokens": 2330453394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7788683110011095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012122993603808244, + "kl": 0.0132293701171875, + "learning_rate": 2.839977986419914e-06, + "loss": 0.0005, + "num_tokens": 2331017586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7790390031578049, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017190224721696343, + "kl": 0.013946533203125, + "learning_rate": 2.835820069851243e-06, + "loss": 0.0006, + "num_tokens": 2331589874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7792096953145004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016091748942149108, + "kl": 0.013671875, + "learning_rate": 2.8316646963078386e-06, + "loss": 0.0005, + "num_tokens": 2332157746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7793803874711956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000728159634427437, + "kl": 0.0137786865234375, + "learning_rate": 2.8275118672647063e-06, + "loss": 0.0006, + "num_tokens": 2332722642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.779551079627891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014793197520781055, + "kl": 0.013336181640625, + "learning_rate": 2.823361584195944e-06, + "loss": 0.0005, + "num_tokens": 2333285106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7797217717845865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006100592471767756, + "kl": 0.01287841796875, + "learning_rate": 2.8192138485747587e-06, + "loss": 0.0005, + "num_tokens": 2333851394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7798924639412819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013629242485902096, + "kl": 0.01318359375, + "learning_rate": 2.815068661873449e-06, + "loss": 0.0005, + "num_tokens": 2334419106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7800631560979773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005979264029654564, + "kl": 0.01336669921875, + "learning_rate": 2.8109260255634018e-06, + "loss": 0.0005, + "num_tokens": 2334984418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7802338482546727, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011410240899980162, + "kl": 0.0130157470703125, + "learning_rate": 2.8067859411151034e-06, + "loss": 0.0005, + "num_tokens": 2335548850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7804045404113681, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037388828696111975, + "kl": 0.0123443603515625, + "learning_rate": 2.8026484099981354e-06, + "loss": 0.0005, + "num_tokens": 2336112690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7805752325680635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003247536004031614, + "kl": 0.01397705078125, + "learning_rate": 2.798513433681175e-06, + "loss": 0.0006, + "num_tokens": 2336679682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7807459247247589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036776286239438864, + "kl": 0.01348876953125, + "learning_rate": 2.7943810136319873e-06, + "loss": 0.0005, + "num_tokens": 2337241042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7809166168814543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013004458573175723, + "kl": 0.01361083984375, + "learning_rate": 2.790251151317428e-06, + "loss": 0.0005, + "num_tokens": 2337806322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7810873090381497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000853288864791555, + "kl": 0.0129241943359375, + "learning_rate": 2.7861238482034535e-06, + "loss": 0.0005, + "num_tokens": 2338368754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7812580011948451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036647624703947744, + "kl": 0.0134429931640625, + "learning_rate": 2.78199910575511e-06, + "loss": 0.0005, + "num_tokens": 2338940290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7814286933515405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006702035718301048, + "kl": 0.0136871337890625, + "learning_rate": 2.777876925436529e-06, + "loss": 0.0005, + "num_tokens": 2339506626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7815993855082359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009715141615108086, + "kl": 0.0135650634765625, + "learning_rate": 2.7737573087109336e-06, + "loss": 0.0005, + "num_tokens": 2340075698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7817700776649313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020152883874110207, + "kl": 0.01324462890625, + "learning_rate": 2.76964025704064e-06, + "loss": 0.0005, + "num_tokens": 2340642130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7819407698216267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004617138518354337, + "kl": 0.0137786865234375, + "learning_rate": 2.7655257718870544e-06, + "loss": 0.0006, + "num_tokens": 2341203554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.782111461978322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001794889335307657, + "kl": 0.013519287109375, + "learning_rate": 2.761413854710677e-06, + "loss": 0.0005, + "num_tokens": 2341766466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7822821541350174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036229981177668527, + "kl": 0.0124053955078125, + "learning_rate": 2.757304506971077e-06, + "loss": 0.0005, + "num_tokens": 2342334002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7824528462917129, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004030644847199898, + "kl": 0.01287841796875, + "learning_rate": 2.7531977301269298e-06, + "loss": 0.0005, + "num_tokens": 2342902066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7826235384484083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009844516726613397, + "kl": 0.012359619140625, + "learning_rate": 2.7490935256359953e-06, + "loss": 0.0005, + "num_tokens": 2343468818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7827942306051037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020242258898562642, + "kl": 0.0129547119140625, + "learning_rate": 2.744991894955118e-06, + "loss": 0.0005, + "num_tokens": 2344036786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7829649227617991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007087842307375618, + "kl": 0.0126953125, + "learning_rate": 2.740892839540229e-06, + "loss": 0.0005, + "num_tokens": 2344600978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7831356149184945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020407119330802394, + "kl": 0.013092041015625, + "learning_rate": 2.736796360846339e-06, + "loss": 0.0005, + "num_tokens": 2345160354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7833063070751899, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005203956208398987, + "kl": 0.01324462890625, + "learning_rate": 2.732702460327554e-06, + "loss": 0.0005, + "num_tokens": 2345725666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7834769992318853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042135704121352226, + "kl": 0.0133819580078125, + "learning_rate": 2.728611139437065e-06, + "loss": 0.0005, + "num_tokens": 2346291570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7836476913885807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007049802479896472, + "kl": 0.0129547119140625, + "learning_rate": 2.7245223996271398e-06, + "loss": 0.0005, + "num_tokens": 2346858882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7838183835452761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007883349032641403, + "kl": 0.0132904052734375, + "learning_rate": 2.7204362423491294e-06, + "loss": 0.0005, + "num_tokens": 2347424178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7839890757019715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017084639370248261, + "kl": 0.0131378173828125, + "learning_rate": 2.716352669053476e-06, + "loss": 0.0005, + "num_tokens": 2347996594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7841597678586669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004842699263055238, + "kl": 0.0133819580078125, + "learning_rate": 2.712271681189703e-06, + "loss": 0.0005, + "num_tokens": 2348557394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7843304600153623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021851492923372594, + "kl": 0.01318359375, + "learning_rate": 2.7081932802064113e-06, + "loss": 0.0005, + "num_tokens": 2349129090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7845011521720577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003515478671565057, + "kl": 0.0140380859375, + "learning_rate": 2.704117467551284e-06, + "loss": 0.0006, + "num_tokens": 2349698098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7846718443287531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005595150259641032, + "kl": 0.01300048828125, + "learning_rate": 2.7000442446710885e-06, + "loss": 0.0005, + "num_tokens": 2350263346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7848425364854484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026789981692267755, + "kl": 0.01263427734375, + "learning_rate": 2.6959736130116763e-06, + "loss": 0.0005, + "num_tokens": 2350829202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7850132286421438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003472968852557939, + "kl": 0.0131378173828125, + "learning_rate": 2.691905574017971e-06, + "loss": 0.0005, + "num_tokens": 2351397106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7851839207988393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012526916100378735, + "kl": 0.012420654296875, + "learning_rate": 2.6878401291339774e-06, + "loss": 0.0005, + "num_tokens": 2351963586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7853546129555347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006972093694695001, + "kl": 0.013458251953125, + "learning_rate": 2.683777279802784e-06, + "loss": 0.0005, + "num_tokens": 2352530914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7855253051122301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009527556780076233, + "kl": 0.013885498046875, + "learning_rate": 2.6797170274665584e-06, + "loss": 0.0006, + "num_tokens": 2353099074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7856959972689255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004686910809479677, + "kl": 0.01336669921875, + "learning_rate": 2.6756593735665413e-06, + "loss": 0.0005, + "num_tokens": 2353673986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7858666894256209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008311041930584625, + "kl": 0.01312255859375, + "learning_rate": 2.67160431954305e-06, + "loss": 0.0005, + "num_tokens": 2354235906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7860373815823163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003017661627177292, + "kl": 0.0127716064453125, + "learning_rate": 2.667551866835485e-06, + "loss": 0.0005, + "num_tokens": 2354795746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7862080737390117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026449916863369947, + "kl": 0.012847900390625, + "learning_rate": 2.6635020168823243e-06, + "loss": 0.0005, + "num_tokens": 2355367074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7863787658957071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001152148950824943, + "kl": 0.013031005859375, + "learning_rate": 2.6594547711211126e-06, + "loss": 0.0005, + "num_tokens": 2355932258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7865494580524025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005864999434444059, + "kl": 0.01385498046875, + "learning_rate": 2.6554101309884817e-06, + "loss": 0.0006, + "num_tokens": 2356495890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7867201502090979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014349054389801053, + "kl": 0.013275146484375, + "learning_rate": 2.6513680979201263e-06, + "loss": 0.0005, + "num_tokens": 2357062002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7868908423657933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009058089254624771, + "kl": 0.0141448974609375, + "learning_rate": 2.64732867335083e-06, + "loss": 0.0006, + "num_tokens": 2357624274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7870615345224887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018351785874271706, + "kl": 0.0130615234375, + "learning_rate": 2.6432918587144342e-06, + "loss": 0.0005, + "num_tokens": 2358191986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7872322266791841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006550772493946738, + "kl": 0.013275146484375, + "learning_rate": 2.63925765544387e-06, + "loss": 0.0005, + "num_tokens": 2358755618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7874029188358795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001083890956725656, + "kl": 0.0134124755859375, + "learning_rate": 2.635226064971127e-06, + "loss": 0.0005, + "num_tokens": 2359317602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.787573610992575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008421276778784821, + "kl": 0.0133209228515625, + "learning_rate": 2.631197088727282e-06, + "loss": 0.0005, + "num_tokens": 2359887746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7877443031492702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001332212561191712, + "kl": 0.013702392578125, + "learning_rate": 2.6271707281424685e-06, + "loss": 0.0005, + "num_tokens": 2360451890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7879149953059656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002627554701572858, + "kl": 0.0133209228515625, + "learning_rate": 2.6231469846459055e-06, + "loss": 0.0005, + "num_tokens": 2361020562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.788085687462661, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014483959372327863, + "kl": 0.0135955810546875, + "learning_rate": 2.6191258596658707e-06, + "loss": 0.0005, + "num_tokens": 2361585058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7882563796193565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011551448018882811, + "kl": 0.014068603515625, + "learning_rate": 2.6151073546297245e-06, + "loss": 0.0006, + "num_tokens": 2362150610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7884270717760519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005681324421576946, + "kl": 0.013824462890625, + "learning_rate": 2.611091470963886e-06, + "loss": 0.0006, + "num_tokens": 2362719154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7885977639327473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011144940385173954, + "kl": 0.013092041015625, + "learning_rate": 2.607078210093853e-06, + "loss": 0.0005, + "num_tokens": 2363279410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7887684560894427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008164189423988456, + "kl": 0.01318359375, + "learning_rate": 2.6030675734441833e-06, + "loss": 0.0005, + "num_tokens": 2363839938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7889391482461381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004091100259270108, + "kl": 0.0134735107421875, + "learning_rate": 2.599059562438515e-06, + "loss": 0.0005, + "num_tokens": 2364409970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7891098404028335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007424773516868797, + "kl": 0.0133209228515625, + "learning_rate": 2.5950541784995397e-06, + "loss": 0.0005, + "num_tokens": 2364980354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7892805325595289, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004911961489977844, + "kl": 0.01300048828125, + "learning_rate": 2.591051423049028e-06, + "loss": 0.0005, + "num_tokens": 2365546834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7894512247162243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007373805408481605, + "kl": 0.0129547119140625, + "learning_rate": 2.5870512975078166e-06, + "loss": 0.0005, + "num_tokens": 2366117122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7896219168729197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016651656064480987, + "kl": 0.0131072998046875, + "learning_rate": 2.583053803295802e-06, + "loss": 0.0005, + "num_tokens": 2366682434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7897926090296151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019203041839441294, + "kl": 0.0132293701171875, + "learning_rate": 2.5790589418319478e-06, + "loss": 0.0005, + "num_tokens": 2367244786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7899633011863105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005995970906067541, + "kl": 0.0135498046875, + "learning_rate": 2.5750667145342888e-06, + "loss": 0.0005, + "num_tokens": 2367812370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7901339933430059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001588400315834426, + "kl": 0.013336181640625, + "learning_rate": 2.571077122819925e-06, + "loss": 0.0005, + "num_tokens": 2368374018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7903046854997013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028695178227056074, + "kl": 0.0130157470703125, + "learning_rate": 2.5670901681050133e-06, + "loss": 0.0005, + "num_tokens": 2368937138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7904753776563966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024039612780972935, + "kl": 0.0127410888671875, + "learning_rate": 2.563105851804777e-06, + "loss": 0.0005, + "num_tokens": 2369504962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.790646069813092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008273561745752319, + "kl": 0.0130157470703125, + "learning_rate": 2.559124175333506e-06, + "loss": 0.0005, + "num_tokens": 2370068962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7908167619697875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010559063874681991, + "kl": 0.0135040283203125, + "learning_rate": 2.555145140104556e-06, + "loss": 0.0005, + "num_tokens": 2370631170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7909874541264829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002719159386198838, + "kl": 0.0132293701171875, + "learning_rate": 2.551168747530337e-06, + "loss": 0.0005, + "num_tokens": 2371197922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7911581462831783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006340834540657227, + "kl": 0.01348876953125, + "learning_rate": 2.5471949990223232e-06, + "loss": 0.0005, + "num_tokens": 2371770994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7913288384398737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011983093465844077, + "kl": 0.013031005859375, + "learning_rate": 2.543223895991054e-06, + "loss": 0.0005, + "num_tokens": 2372333042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7914995305965691, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010414870931344758, + "kl": 0.0135498046875, + "learning_rate": 2.539255439846129e-06, + "loss": 0.0005, + "num_tokens": 2372896834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7916702227532645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004685566069890375, + "kl": 0.01300048828125, + "learning_rate": 2.5352896319962063e-06, + "loss": 0.0005, + "num_tokens": 2373468338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7918409149099599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010975645188346513, + "kl": 0.013671875, + "learning_rate": 2.5313264738490006e-06, + "loss": 0.0005, + "num_tokens": 2374026018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7920116070666553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015821622715494273, + "kl": 0.0128173828125, + "learning_rate": 2.527365966811294e-06, + "loss": 0.0005, + "num_tokens": 2374587714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7921822992233507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009585369907268455, + "kl": 0.013092041015625, + "learning_rate": 2.5234081122889243e-06, + "loss": 0.0005, + "num_tokens": 2375151570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7923529913800461, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007792211089776091, + "kl": 0.01214599609375, + "learning_rate": 2.519452911686786e-06, + "loss": 0.0005, + "num_tokens": 2375719698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7925236835367415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002889297574177623, + "kl": 0.0133209228515625, + "learning_rate": 2.5155003664088282e-06, + "loss": 0.0005, + "num_tokens": 2376288530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7926943756934369, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034873227276601147, + "kl": 0.0132293701171875, + "learning_rate": 2.5115504778580656e-06, + "loss": 0.0005, + "num_tokens": 2376850194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7928650678501323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006587620941064681, + "kl": 0.013580322265625, + "learning_rate": 2.5076032474365662e-06, + "loss": 0.0005, + "num_tokens": 2377412706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7930357600068277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000961534323052096, + "kl": 0.01336669921875, + "learning_rate": 2.5036586765454594e-06, + "loss": 0.0005, + "num_tokens": 2377975218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.793206452163523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034809993230247805, + "kl": 0.0130615234375, + "learning_rate": 2.4997167665849152e-06, + "loss": 0.0005, + "num_tokens": 2378538274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7933771443202184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006243984988911928, + "kl": 0.0124359130859375, + "learning_rate": 2.495777518954173e-06, + "loss": 0.0005, + "num_tokens": 2379107746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7935478364769138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010807140842097609, + "kl": 0.0132598876953125, + "learning_rate": 2.491840935051525e-06, + "loss": 0.0005, + "num_tokens": 2379672658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7937185286336093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007681440931468226, + "kl": 0.0128021240234375, + "learning_rate": 2.4879070162743192e-06, + "loss": 0.0005, + "num_tokens": 2380237170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7938892207903047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004532529921750191, + "kl": 0.0133209228515625, + "learning_rate": 2.483975764018952e-06, + "loss": 0.0005, + "num_tokens": 2380800738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7940599129470001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007273486135364628, + "kl": 0.0129852294921875, + "learning_rate": 2.4800471796808723e-06, + "loss": 0.0005, + "num_tokens": 2381368706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7942306051036955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008731874026070115, + "kl": 0.0135040283203125, + "learning_rate": 2.4761212646545895e-06, + "loss": 0.0005, + "num_tokens": 2381933890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7944012972603909, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043283777890441996, + "kl": 0.013031005859375, + "learning_rate": 2.472198020333664e-06, + "loss": 0.0005, + "num_tokens": 2382496898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7945719894170863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007136866194150197, + "kl": 0.0134429931640625, + "learning_rate": 2.468277448110703e-06, + "loss": 0.0005, + "num_tokens": 2383068962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7947426815737817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003249849681161717, + "kl": 0.012451171875, + "learning_rate": 2.4643595493773652e-06, + "loss": 0.0005, + "num_tokens": 2383637170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7949133737304771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007487839118078668, + "kl": 0.013427734375, + "learning_rate": 2.460444325524367e-06, + "loss": 0.0005, + "num_tokens": 2384206546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7950840658871725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009860245067234825, + "kl": 0.013702392578125, + "learning_rate": 2.4565317779414734e-06, + "loss": 0.0005, + "num_tokens": 2384773314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7952547580438679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008109975759526447, + "kl": 0.0130615234375, + "learning_rate": 2.452621908017495e-06, + "loss": 0.0005, + "num_tokens": 2385336898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7954254502005633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012563818457794757, + "kl": 0.0132293701171875, + "learning_rate": 2.4487147171402927e-06, + "loss": 0.0005, + "num_tokens": 2385894354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7955961423572587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003135586004520404, + "kl": 0.01361083984375, + "learning_rate": 2.4448102066967785e-06, + "loss": 0.0005, + "num_tokens": 2386465266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7957668345139541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022643406687745078, + "kl": 0.0128326416015625, + "learning_rate": 2.440908378072918e-06, + "loss": 0.0005, + "num_tokens": 2387027794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7959375266706494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003384223990403949, + "kl": 0.01336669921875, + "learning_rate": 2.437009232653715e-06, + "loss": 0.0005, + "num_tokens": 2387599474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7961082188273448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030393496608842736, + "kl": 0.0131072998046875, + "learning_rate": 2.433112771823224e-06, + "loss": 0.0005, + "num_tokens": 2388171250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7962789109840402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006607272994921294, + "kl": 0.01239013671875, + "learning_rate": 2.4292189969645506e-06, + "loss": 0.0005, + "num_tokens": 2388745666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7964496031407357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023552551342417394, + "kl": 0.01324462890625, + "learning_rate": 2.425327909459846e-06, + "loss": 0.0005, + "num_tokens": 2389308834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7966202952974311, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007868102211144068, + "kl": 0.0128021240234375, + "learning_rate": 2.4214395106903e-06, + "loss": 0.0005, + "num_tokens": 2389873058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7967909874541265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032487768781327715, + "kl": 0.0129547119140625, + "learning_rate": 2.417553802036161e-06, + "loss": 0.0005, + "num_tokens": 2390439106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7969616796108219, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003973351505335119, + "kl": 0.0125274658203125, + "learning_rate": 2.4136707848767094e-06, + "loss": 0.0005, + "num_tokens": 2391000898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7971323717675173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002614353438004075, + "kl": 0.0133514404296875, + "learning_rate": 2.40979046059028e-06, + "loss": 0.0005, + "num_tokens": 2391565234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7973030639242127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008139490668353298, + "kl": 0.0132598876953125, + "learning_rate": 2.4059128305542444e-06, + "loss": 0.0005, + "num_tokens": 2392128706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7974737560809081, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000732689789302829, + "kl": 0.0139312744140625, + "learning_rate": 2.4020378961450255e-06, + "loss": 0.0006, + "num_tokens": 2392698994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7976444482376035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000706530333398505, + "kl": 0.0122222900390625, + "learning_rate": 2.39816565873808e-06, + "loss": 0.0005, + "num_tokens": 2393266530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7978151403942989, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006656036920059855, + "kl": 0.01324462890625, + "learning_rate": 2.3942961197079185e-06, + "loss": 0.0005, + "num_tokens": 2393828738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7979858325509943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004537941365947599, + "kl": 0.0135345458984375, + "learning_rate": 2.3904292804280802e-06, + "loss": 0.0005, + "num_tokens": 2394395362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7981565247076897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005632767295796749, + "kl": 0.0133056640625, + "learning_rate": 2.386565142271162e-06, + "loss": 0.0005, + "num_tokens": 2394959842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7983272168643851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043175532178774595, + "kl": 0.0129241943359375, + "learning_rate": 2.3827037066087856e-06, + "loss": 0.0005, + "num_tokens": 2395523442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7984979090210805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011191077933095923, + "kl": 0.01361083984375, + "learning_rate": 2.378844974811628e-06, + "loss": 0.0005, + "num_tokens": 2396094034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7986686011777758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048311485783738997, + "kl": 0.01287841796875, + "learning_rate": 2.3749889482493926e-06, + "loss": 0.0005, + "num_tokens": 2396652914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7988392933344712, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037135553856303956, + "kl": 0.0137481689453125, + "learning_rate": 2.371135628290837e-06, + "loss": 0.0005, + "num_tokens": 2397226482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7990099854911666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005712487889011153, + "kl": 0.01275634765625, + "learning_rate": 2.3672850163037444e-06, + "loss": 0.0005, + "num_tokens": 2397788850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.799180677647862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003007681224118624, + "kl": 0.0131683349609375, + "learning_rate": 2.3634371136549483e-06, + "loss": 0.0005, + "num_tokens": 2398351234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7993513698045575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001438884728500116, + "kl": 0.01336669921875, + "learning_rate": 2.3595919217103104e-06, + "loss": 0.0005, + "num_tokens": 2398918546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7995220619612529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007243794488243902, + "kl": 0.0135650634765625, + "learning_rate": 2.3557494418347373e-06, + "loss": 0.0005, + "num_tokens": 2399486226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7996927541179483, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003276191821586875, + "kl": 0.0128936767578125, + "learning_rate": 2.3519096753921732e-06, + "loss": 0.0005, + "num_tokens": 2400052674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7998634462746437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010499642685557994, + "kl": 0.0133514404296875, + "learning_rate": 2.3480726237455943e-06, + "loss": 0.0005, + "num_tokens": 2400620770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8000341384313391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00012542352867694586, + "kl": 0.0134124755859375, + "learning_rate": 2.3442382882570126e-06, + "loss": 0.0005, + "num_tokens": 2401188642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8002048305880345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006915301448971377, + "kl": 0.011749267578125, + "learning_rate": 2.3404066702874816e-06, + "loss": 0.0005, + "num_tokens": 2401754162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8003755227447299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003980080426528409, + "kl": 0.0130767822265625, + "learning_rate": 2.3365777711970895e-06, + "loss": 0.0005, + "num_tokens": 2402320898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8005462149014253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019370083865311488, + "kl": 0.013427734375, + "learning_rate": 2.332751592344955e-06, + "loss": 0.0005, + "num_tokens": 2402890802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8007169070581207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001375791022766364, + "kl": 0.012451171875, + "learning_rate": 2.328928135089231e-06, + "loss": 0.0005, + "num_tokens": 2403461042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8008875992148161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020868915464078297, + "kl": 0.013427734375, + "learning_rate": 2.325107400787109e-06, + "loss": 0.0005, + "num_tokens": 2404032834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8010582913715115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002590533908699543, + "kl": 0.0131683349609375, + "learning_rate": 2.3212893907948154e-06, + "loss": 0.0005, + "num_tokens": 2404594018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8012289835282069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008292797551264266, + "kl": 0.01287841796875, + "learning_rate": 2.317474106467602e-06, + "loss": 0.0005, + "num_tokens": 2405167426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8013996756849022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002458903737824146, + "kl": 0.012939453125, + "learning_rate": 2.313661549159755e-06, + "loss": 0.0005, + "num_tokens": 2405732002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8015703678415976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008919708462799841, + "kl": 0.01312255859375, + "learning_rate": 2.309851720224596e-06, + "loss": 0.0005, + "num_tokens": 2406297410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.801741059998293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007064398807090278, + "kl": 0.0122222900390625, + "learning_rate": 2.3060446210144826e-06, + "loss": 0.0005, + "num_tokens": 2406870242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8019117521549884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014622860493527075, + "kl": 0.0131378173828125, + "learning_rate": 2.302240252880792e-06, + "loss": 0.0005, + "num_tokens": 2407430546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8020824443116839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008948240032818805, + "kl": 0.0126953125, + "learning_rate": 2.2984386171739358e-06, + "loss": 0.0005, + "num_tokens": 2407994194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8022531364683793, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036558308478551346, + "kl": 0.0129547119140625, + "learning_rate": 2.2946397152433595e-06, + "loss": 0.0005, + "num_tokens": 2408558146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8024238286250747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018520990227827709, + "kl": 0.0128021240234375, + "learning_rate": 2.29084354843754e-06, + "loss": 0.0005, + "num_tokens": 2409125090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8025945207817701, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007151693501203803, + "kl": 0.0136871337890625, + "learning_rate": 2.287050118103976e-06, + "loss": 0.0005, + "num_tokens": 2409697650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8027652129384655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00094246984687569, + "kl": 0.01287841796875, + "learning_rate": 2.283259425589196e-06, + "loss": 0.0005, + "num_tokens": 2410269474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8029359050951609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001714578397913567, + "kl": 0.013153076171875, + "learning_rate": 2.2794714722387623e-06, + "loss": 0.0005, + "num_tokens": 2410833410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8031065972518563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002521730988831911, + "kl": 0.0140228271484375, + "learning_rate": 2.2756862593972596e-06, + "loss": 0.0006, + "num_tokens": 2411401778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8032772894085517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008149308155392787, + "kl": 0.01324462890625, + "learning_rate": 2.2719037884083097e-06, + "loss": 0.0005, + "num_tokens": 2411967842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8034479815652471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010430306301636944, + "kl": 0.012786865234375, + "learning_rate": 2.2681240606145406e-06, + "loss": 0.0005, + "num_tokens": 2412537058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8036186737219425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003787193046767439, + "kl": 0.0124969482421875, + "learning_rate": 2.264347077357625e-06, + "loss": 0.0005, + "num_tokens": 2413105394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8037893658786379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041691578439590724, + "kl": 0.012664794921875, + "learning_rate": 2.260572839978257e-06, + "loss": 0.0005, + "num_tokens": 2413678306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8039600580353333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008959091898877332, + "kl": 0.0130462646484375, + "learning_rate": 2.2568013498161546e-06, + "loss": 0.0005, + "num_tokens": 2414242562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8041307501920287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016935676046986648, + "kl": 0.013153076171875, + "learning_rate": 2.2530326082100608e-06, + "loss": 0.0005, + "num_tokens": 2414802738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.804301442348724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003295832635316909, + "kl": 0.0137481689453125, + "learning_rate": 2.2492666164977384e-06, + "loss": 0.0005, + "num_tokens": 2415370258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8044721345054194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016647414914411204, + "kl": 0.0134429931640625, + "learning_rate": 2.2455033760159817e-06, + "loss": 0.0005, + "num_tokens": 2415946386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8046428266621148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005380995934700655, + "kl": 0.012725830078125, + "learning_rate": 2.2417428881006087e-06, + "loss": 0.0005, + "num_tokens": 2416516386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8048135188188102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000765323236692747, + "kl": 0.0123291015625, + "learning_rate": 2.237985154086453e-06, + "loss": 0.0005, + "num_tokens": 2417091426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8049842109755057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002161563959032647, + "kl": 0.0128021240234375, + "learning_rate": 2.2342301753073726e-06, + "loss": 0.0005, + "num_tokens": 2417659074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8051549031322011, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021036530952952642, + "kl": 0.013275146484375, + "learning_rate": 2.2304779530962505e-06, + "loss": 0.0005, + "num_tokens": 2418230818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8053255952888965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007087583698740686, + "kl": 0.0135955810546875, + "learning_rate": 2.2267284887849948e-06, + "loss": 0.0005, + "num_tokens": 2418794786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8054962874455919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007389368724026047, + "kl": 0.0137939453125, + "learning_rate": 2.2229817837045275e-06, + "loss": 0.0006, + "num_tokens": 2419356290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8056669796022873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009609625237645927, + "kl": 0.0133056640625, + "learning_rate": 2.219237839184789e-06, + "loss": 0.0005, + "num_tokens": 2419918930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8058376717589827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007921975690526068, + "kl": 0.013092041015625, + "learning_rate": 2.2154966565547485e-06, + "loss": 0.0005, + "num_tokens": 2420481586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8060083639156781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018911169102419554, + "kl": 0.0130615234375, + "learning_rate": 2.2117582371423928e-06, + "loss": 0.0005, + "num_tokens": 2421044258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8061790560723735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023996597287402702, + "kl": 0.01312255859375, + "learning_rate": 2.208022582274725e-06, + "loss": 0.0005, + "num_tokens": 2421609282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8063497482290689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017079277064695141, + "kl": 0.0133819580078125, + "learning_rate": 2.2042896932777624e-06, + "loss": 0.0005, + "num_tokens": 2422180354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8065204403857643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003171560718842594, + "kl": 0.0130462646484375, + "learning_rate": 2.2005595714765494e-06, + "loss": 0.0005, + "num_tokens": 2422746050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8066911325424597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004422549048162905, + "kl": 0.0132904052734375, + "learning_rate": 2.1968322181951484e-06, + "loss": 0.0005, + "num_tokens": 2423312642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8068618246991551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00079136196214106, + "kl": 0.0133514404296875, + "learning_rate": 2.1931076347566285e-06, + "loss": 0.0005, + "num_tokens": 2423874994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8070325168558504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000540535172327068, + "kl": 0.01373291015625, + "learning_rate": 2.189385822483089e-06, + "loss": 0.0005, + "num_tokens": 2424437026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8072032090125458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047428771251986266, + "kl": 0.0125579833984375, + "learning_rate": 2.1856667826956314e-06, + "loss": 0.0005, + "num_tokens": 2425000610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8073739011692412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014404516714426098, + "kl": 0.0130767822265625, + "learning_rate": 2.1819505167143885e-06, + "loss": 0.0005, + "num_tokens": 2425570130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8075445933259366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013626943118617786, + "kl": 0.013214111328125, + "learning_rate": 2.178237025858494e-06, + "loss": 0.0005, + "num_tokens": 2426139026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.807715285482632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021988756191003572, + "kl": 0.0133056640625, + "learning_rate": 2.174526311446109e-06, + "loss": 0.0005, + "num_tokens": 2426706146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8078859776393275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009256737218123234, + "kl": 0.0131683349609375, + "learning_rate": 2.1708183747943966e-06, + "loss": 0.0005, + "num_tokens": 2427274242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8080566697960229, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003073086358267724, + "kl": 0.012969970703125, + "learning_rate": 2.1671132172195462e-06, + "loss": 0.0005, + "num_tokens": 2427847218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8082273619527183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005677393389013643, + "kl": 0.0128173828125, + "learning_rate": 2.1634108400367513e-06, + "loss": 0.0005, + "num_tokens": 2428411474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8083980541094137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009675363935972211, + "kl": 0.0135345458984375, + "learning_rate": 2.1597112445602254e-06, + "loss": 0.0005, + "num_tokens": 2428984626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8085687462661091, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009021009869857466, + "kl": 0.013092041015625, + "learning_rate": 2.1560144321031873e-06, + "loss": 0.0005, + "num_tokens": 2429549650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8087394384228045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002758125466453399, + "kl": 0.013336181640625, + "learning_rate": 2.152320403977877e-06, + "loss": 0.0005, + "num_tokens": 2430114434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8089101305794999, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031100365943480203, + "kl": 0.013336181640625, + "learning_rate": 2.1486291614955357e-06, + "loss": 0.0005, + "num_tokens": 2430681650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8090808227361953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00100659794082429, + "kl": 0.01300048828125, + "learning_rate": 2.144940705966425e-06, + "loss": 0.0005, + "num_tokens": 2431242018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8092515148928907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001184847598634978, + "kl": 0.01361083984375, + "learning_rate": 2.141255038699811e-06, + "loss": 0.0005, + "num_tokens": 2431814258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8094222070495861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000161146319388273, + "kl": 0.012481689453125, + "learning_rate": 2.1375721610039766e-06, + "loss": 0.0005, + "num_tokens": 2432391490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8095928992062815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011933171866409646, + "kl": 0.0130462646484375, + "learning_rate": 2.1338920741862044e-06, + "loss": 0.0005, + "num_tokens": 2432961042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8097635913629768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00041844259790576113, + "kl": 0.0131072998046875, + "learning_rate": 2.1302147795527994e-06, + "loss": 0.0005, + "num_tokens": 2433527010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8099342835196722, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009165138155681817, + "kl": 0.0131988525390625, + "learning_rate": 2.1265402784090615e-06, + "loss": 0.0005, + "num_tokens": 2434091906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8101049756763676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023103689227187114, + "kl": 0.012481689453125, + "learning_rate": 2.1228685720593113e-06, + "loss": 0.0005, + "num_tokens": 2434663314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.810275667833063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007623087762456951, + "kl": 0.0135040283203125, + "learning_rate": 2.119199661806868e-06, + "loss": 0.0005, + "num_tokens": 2435232050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8104463599897584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001954238370467617, + "kl": 0.012908935546875, + "learning_rate": 2.115533548954064e-06, + "loss": 0.0005, + "num_tokens": 2435797666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8106170521464539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005097988235183122, + "kl": 0.01263427734375, + "learning_rate": 2.111870234802239e-06, + "loss": 0.0005, + "num_tokens": 2436363682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8107877443031493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005742860999851926, + "kl": 0.0129852294921875, + "learning_rate": 2.1082097206517373e-06, + "loss": 0.0005, + "num_tokens": 2436928258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8109584364598447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011584811315122309, + "kl": 0.03399658203125, + "learning_rate": 2.1045520078019033e-06, + "loss": 0.0014, + "num_tokens": 2437519986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8111291286165401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014495198473346378, + "kl": 0.01385498046875, + "learning_rate": 2.100897097551098e-06, + "loss": 0.0006, + "num_tokens": 2438086338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8112998207732355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004276865344517489, + "kl": 0.01348876953125, + "learning_rate": 2.0972449911966843e-06, + "loss": 0.0005, + "num_tokens": 2438655266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8114705129299309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008225793342091041, + "kl": 0.0127105712890625, + "learning_rate": 2.0935956900350252e-06, + "loss": 0.0005, + "num_tokens": 2439221250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8116412050866263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006306999289271936, + "kl": 0.01275634765625, + "learning_rate": 2.089949195361489e-06, + "loss": 0.0005, + "num_tokens": 2439788322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8118118972433217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006068984593504192, + "kl": 0.013427734375, + "learning_rate": 2.086305508470452e-06, + "loss": 0.0005, + "num_tokens": 2440349794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8119825894000171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009064020487243678, + "kl": 0.012939453125, + "learning_rate": 2.0826646306552945e-06, + "loss": 0.0005, + "num_tokens": 2440910210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8121532815567125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007949632928605632, + "kl": 0.013275146484375, + "learning_rate": 2.079026563208394e-06, + "loss": 0.0005, + "num_tokens": 2441480242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8123239737134079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001500408675684473, + "kl": 0.013214111328125, + "learning_rate": 2.0753913074211287e-06, + "loss": 0.0005, + "num_tokens": 2442042946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8124946658701032, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001054087596800597, + "kl": 0.0130462646484375, + "learning_rate": 2.0717588645838883e-06, + "loss": 0.0005, + "num_tokens": 2442608098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8126653580267986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005780566849555345, + "kl": 0.013519287109375, + "learning_rate": 2.0681292359860607e-06, + "loss": 0.0005, + "num_tokens": 2443173522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.812836050183494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002432302491699555, + "kl": 0.01397705078125, + "learning_rate": 2.0645024229160305e-06, + "loss": 0.0006, + "num_tokens": 2443738706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8130067423401894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008547266355212962, + "kl": 0.0133819580078125, + "learning_rate": 2.0608784266611816e-06, + "loss": 0.0005, + "num_tokens": 2444306658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8131774344968848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9206551821996357, + "kl": 0.1281280517578125, + "learning_rate": 2.0572572485079056e-06, + "loss": 0.0051, + "num_tokens": 2444876418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8133481266535803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008339468701610781, + "kl": 0.0132598876953125, + "learning_rate": 2.0536388897415905e-06, + "loss": 0.0005, + "num_tokens": 2445436706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8135188188102757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007540261606711157, + "kl": 0.01446533203125, + "learning_rate": 2.0500233516466275e-06, + "loss": 0.0006, + "num_tokens": 2446003186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8136895109669711, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000337993371237882, + "kl": 0.0150604248046875, + "learning_rate": 2.0464106355063916e-06, + "loss": 0.0006, + "num_tokens": 2446566162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8138602031236665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031471927536014226, + "kl": 0.016204833984375, + "learning_rate": 2.0428007426032714e-06, + "loss": 0.0006, + "num_tokens": 2447134994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8140308952803619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009029921072684938, + "kl": 0.017242431640625, + "learning_rate": 2.03919367421865e-06, + "loss": 0.0007, + "num_tokens": 2447700962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8142015874370573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002217871522701714, + "kl": 0.018890380859375, + "learning_rate": 2.0355894316329085e-06, + "loss": 0.0008, + "num_tokens": 2448275074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8143722795937527, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001191264824499535, + "kl": 0.021392822265625, + "learning_rate": 2.0319880161254192e-06, + "loss": 0.0009, + "num_tokens": 2448842130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8145429717504481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002870198438742425, + "kl": 0.02239990234375, + "learning_rate": 2.028389428974553e-06, + "loss": 0.0009, + "num_tokens": 2449410834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8147136639071435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003929941008229079, + "kl": 0.025848388671875, + "learning_rate": 2.0247936714576812e-06, + "loss": 0.001, + "num_tokens": 2449972354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8148843560638389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007805096974405472, + "kl": 0.027099609375, + "learning_rate": 2.0212007448511694e-06, + "loss": 0.0011, + "num_tokens": 2450544882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8150550482205343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0069164132577610345, + "kl": 0.0323486328125, + "learning_rate": 2.017610650430376e-06, + "loss": 0.0013, + "num_tokens": 2451117266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8152257403772296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00847864169433739, + "kl": 0.034423828125, + "learning_rate": 2.0140233894696515e-06, + "loss": 0.0014, + "num_tokens": 2451686146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.815396432533925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011717064749703408, + "kl": 0.0377197265625, + "learning_rate": 2.010438963242346e-06, + "loss": 0.0015, + "num_tokens": 2452248562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8155671246906204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012616361650688414, + "kl": 0.03912353515625, + "learning_rate": 2.006857373020804e-06, + "loss": 0.0016, + "num_tokens": 2452817794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8157378168473158, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022983252406554383, + "kl": 0.0406494140625, + "learning_rate": 2.00327862007636e-06, + "loss": 0.0016, + "num_tokens": 2453381266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8159085090040112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24870855463603742, + "kl": 0.07757568359375, + "learning_rate": 1.9997027056793374e-06, + "loss": 0.0031, + "num_tokens": 2453949650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8160792011607066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007977877895590294, + "kl": 0.03948974609375, + "learning_rate": 1.9961296310990608e-06, + "loss": 0.0016, + "num_tokens": 2454520098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.816249893317402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012870328736872478, + "kl": 0.03857421875, + "learning_rate": 1.9925593976038436e-06, + "loss": 0.0015, + "num_tokens": 2455083570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8164205854740975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007397762918938783, + "kl": 0.0379638671875, + "learning_rate": 1.988992006460989e-06, + "loss": 0.0015, + "num_tokens": 2455648498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8165912776307929, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028789595283272636, + "kl": 0.04571533203125, + "learning_rate": 1.9854274589367883e-06, + "loss": 0.0018, + "num_tokens": 2456213426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8167619697874883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012226150594760237, + "kl": 0.040283203125, + "learning_rate": 1.9818657562965305e-06, + "loss": 0.0016, + "num_tokens": 2456778354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8169326619441837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012245734492732792, + "kl": 0.03515625, + "learning_rate": 1.978306899804494e-06, + "loss": 0.0014, + "num_tokens": 2457342178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8171033541008791, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011355286871683646, + "kl": 0.03460693359375, + "learning_rate": 1.974750890723941e-06, + "loss": 0.0014, + "num_tokens": 2457902722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8172740462575745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003958014503625289, + "kl": 0.032745361328125, + "learning_rate": 1.9711977303171248e-06, + "loss": 0.0013, + "num_tokens": 2458469122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8174447384142699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05707101400281352, + "kl": 0.03717041015625, + "learning_rate": 1.967647419845291e-06, + "loss": 0.0015, + "num_tokens": 2459038434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8176154305709653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010227453530612485, + "kl": 0.030303955078125, + "learning_rate": 1.964099960568675e-06, + "loss": 0.0012, + "num_tokens": 2459607858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8177861227276607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007279915990076554, + "kl": 0.0328369140625, + "learning_rate": 1.9605553537464906e-06, + "loss": 0.0013, + "num_tokens": 2460174306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.817956814884356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02045796095371433, + "kl": 0.0361328125, + "learning_rate": 1.9570136006369513e-06, + "loss": 0.0014, + "num_tokens": 2460739874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8181275070410514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010574185718020112, + "kl": 0.03204345703125, + "learning_rate": 1.9534747024972455e-06, + "loss": 0.0013, + "num_tokens": 2461304562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8182981991977468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00570249526209667, + "kl": 0.03387451171875, + "learning_rate": 1.94993866058356e-06, + "loss": 0.0014, + "num_tokens": 2461869426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8184688913544422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005296860187115218, + "kl": 0.03173828125, + "learning_rate": 1.9464054761510574e-06, + "loss": 0.0013, + "num_tokens": 2462438994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8186395835111376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007795051253584432, + "kl": 0.0322265625, + "learning_rate": 1.9428751504538957e-06, + "loss": 0.0013, + "num_tokens": 2463004082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.818810275667833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012811531225870013, + "kl": 0.03558349609375, + "learning_rate": 1.9393476847452064e-06, + "loss": 0.0014, + "num_tokens": 2463571362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8189809678245284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26734591826435294, + "kl": 0.0614013671875, + "learning_rate": 1.9358230802771196e-06, + "loss": 0.0025, + "num_tokens": 2464133602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8191516599812239, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.422233060128108, + "kl": 0.2713623046875, + "learning_rate": 1.932301338300736e-06, + "loss": 0.0109, + "num_tokens": 2464697522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8193223521379193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00899635855315293, + "kl": 0.03424072265625, + "learning_rate": 1.928782460066152e-06, + "loss": 0.0014, + "num_tokens": 2465268594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8194930442946147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01353138076176201, + "kl": 0.03741455078125, + "learning_rate": 1.925266446822439e-06, + "loss": 0.0015, + "num_tokens": 2465831874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8196637364513101, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011514941799075603, + "kl": 0.03643798828125, + "learning_rate": 1.921753299817659e-06, + "loss": 0.0015, + "num_tokens": 2466394466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8198344286080055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012779391507330443, + "kl": 0.0386962890625, + "learning_rate": 1.9182430202988467e-06, + "loss": 0.0015, + "num_tokens": 2466960354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8200051207647009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01658996271343943, + "kl": 0.0377197265625, + "learning_rate": 1.9147356095120297e-06, + "loss": 0.0015, + "num_tokens": 2467522626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8201758129213963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010222287682491957, + "kl": 0.0350341796875, + "learning_rate": 1.9112310687022086e-06, + "loss": 0.0014, + "num_tokens": 2468084962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8203465050780917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016930911026048615, + "kl": 0.03948974609375, + "learning_rate": 1.9077293991133715e-06, + "loss": 0.0016, + "num_tokens": 2468649730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8205171972347871, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011694479378526414, + "kl": 0.03704833984375, + "learning_rate": 1.9042306019884816e-06, + "loss": 0.0015, + "num_tokens": 2469213890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8206878893914825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00914193680459309, + "kl": 0.0379638671875, + "learning_rate": 1.9007346785694868e-06, + "loss": 0.0015, + "num_tokens": 2469777794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8208585815481778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4137452079986824, + "kl": 0.095947265625, + "learning_rate": 1.8972416300973207e-06, + "loss": 0.0038, + "num_tokens": 2470339890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8210292737048732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08142600400148343, + "kl": 0.04229736328125, + "learning_rate": 1.8937514578118777e-06, + "loss": 0.0017, + "num_tokens": 2470908098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8211999658615686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010795363677017748, + "kl": 0.035064697265625, + "learning_rate": 1.890264162952048e-06, + "loss": 0.0014, + "num_tokens": 2471489426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.821370658018264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01823653769323962, + "kl": 0.05682373046875, + "learning_rate": 1.8867797467556958e-06, + "loss": 0.0023, + "num_tokens": 2472050338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8215413501749594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01917411078261083, + "kl": 0.05633544921875, + "learning_rate": 1.8832982104596665e-06, + "loss": 0.0023, + "num_tokens": 2472612674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8217120423316548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011203564905140037, + "kl": 0.035400390625, + "learning_rate": 1.8798195552997756e-06, + "loss": 0.0014, + "num_tokens": 2473177810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8218827344883503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013058123898718374, + "kl": 0.0386962890625, + "learning_rate": 1.8763437825108178e-06, + "loss": 0.0015, + "num_tokens": 2473738962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8220534266450457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01929456964321118, + "kl": 0.0618896484375, + "learning_rate": 1.8728708933265715e-06, + "loss": 0.0025, + "num_tokens": 2474299266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8222241188017411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007585437017112176, + "kl": 0.0347900390625, + "learning_rate": 1.8694008889797866e-06, + "loss": 0.0014, + "num_tokens": 2474863378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8223948109584365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01598454335323165, + "kl": 0.056396484375, + "learning_rate": 1.8659337707021896e-06, + "loss": 0.0023, + "num_tokens": 2475429266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8225655031151319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020641619549641714, + "kl": 0.0550537109375, + "learning_rate": 1.8624695397244773e-06, + "loss": 0.0022, + "num_tokens": 2475990290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8227361952718273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016438959051829075, + "kl": 0.0560302734375, + "learning_rate": 1.8590081972763319e-06, + "loss": 0.0022, + "num_tokens": 2476556946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8229068874285227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025887459503634178, + "kl": 0.071197509765625, + "learning_rate": 1.8555497445864046e-06, + "loss": 0.0029, + "num_tokens": 2477124466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8230775795852181, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0079205744604781, + "kl": 0.030181884765625, + "learning_rate": 1.8520941828823213e-06, + "loss": 0.0012, + "num_tokens": 2477686386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8232482717419135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014699670433537567, + "kl": 0.05224609375, + "learning_rate": 1.8486415133906777e-06, + "loss": 0.0021, + "num_tokens": 2478249042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8234189638986089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015143911094037735, + "kl": 0.040069580078125, + "learning_rate": 1.8451917373370498e-06, + "loss": 0.0016, + "num_tokens": 2478810738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8235896560553042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005669132333576786, + "kl": 0.027740478515625, + "learning_rate": 1.8417448559459849e-06, + "loss": 0.0011, + "num_tokens": 2479381106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8237603482119996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010257234091258482, + "kl": 0.031158447265625, + "learning_rate": 1.838300870441001e-06, + "loss": 0.0012, + "num_tokens": 2479947410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.823931040368695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005042538572511986, + "kl": 0.024200439453125, + "learning_rate": 1.8348597820445857e-06, + "loss": 0.001, + "num_tokens": 2480515378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8241017325253904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016082242674193368, + "kl": 0.051788330078125, + "learning_rate": 1.831421591978203e-06, + "loss": 0.0021, + "num_tokens": 2481084322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8242724246820858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0050871935531023975, + "kl": 0.024322509765625, + "learning_rate": 1.8279863014622878e-06, + "loss": 0.001, + "num_tokens": 2481652066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8244431168387812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026003206300213378, + "kl": 0.02410888671875, + "learning_rate": 1.8245539117162414e-06, + "loss": 0.001, + "num_tokens": 2482215618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8246138089954766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004524506480499947, + "kl": 0.02081298828125, + "learning_rate": 1.8211244239584413e-06, + "loss": 0.0008, + "num_tokens": 2482782594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8247845011521721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002288962570946133, + "kl": 0.02020263671875, + "learning_rate": 1.8176978394062284e-06, + "loss": 0.0008, + "num_tokens": 2483346882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8249551933088675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003242303986051462, + "kl": 0.021087646484375, + "learning_rate": 1.8142741592759205e-06, + "loss": 0.0008, + "num_tokens": 2483915218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8251258854655629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01327123421659326, + "kl": 0.0516357421875, + "learning_rate": 1.8108533847827959e-06, + "loss": 0.0021, + "num_tokens": 2484478818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8252965776222583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005729290573612297, + "kl": 0.024444580078125, + "learning_rate": 1.8074355171411117e-06, + "loss": 0.001, + "num_tokens": 2485043218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8254672697789537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025905195114478523, + "kl": 0.019378662109375, + "learning_rate": 1.8040205575640813e-06, + "loss": 0.0008, + "num_tokens": 2485609826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8256379619356491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027986644827778786, + "kl": 0.019134521484375, + "learning_rate": 1.8006085072638967e-06, + "loss": 0.0008, + "num_tokens": 2486174562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8258086540923445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20138045076826683, + "kl": 0.070709228515625, + "learning_rate": 1.7971993674517096e-06, + "loss": 0.0028, + "num_tokens": 2486740354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8259793462490399, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025992497379822887, + "kl": 0.01898193359375, + "learning_rate": 1.793793139337645e-06, + "loss": 0.0008, + "num_tokens": 2487305426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8261500384057353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026615273248610937, + "kl": 0.019134521484375, + "learning_rate": 1.7903898241307872e-06, + "loss": 0.0008, + "num_tokens": 2487873058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8263207305624306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014561704515570453, + "kl": 0.019622802734375, + "learning_rate": 1.7869894230391938e-06, + "loss": 0.0008, + "num_tokens": 2488440402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.826491422719126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013025164452828837, + "kl": 0.032135009765625, + "learning_rate": 1.7835919372698796e-06, + "loss": 0.0013, + "num_tokens": 2489003122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8266621148758214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024927354121192804, + "kl": 0.02081298828125, + "learning_rate": 1.7801973680288354e-06, + "loss": 0.0008, + "num_tokens": 2489568610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8268328070325168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015267867163563911, + "kl": 0.018341064453125, + "learning_rate": 1.7768057165210052e-06, + "loss": 0.0007, + "num_tokens": 2490140178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8270034991892122, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023536953678711717, + "kl": 0.0196533203125, + "learning_rate": 1.773416983950308e-06, + "loss": 0.0008, + "num_tokens": 2490704946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8271741913459076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014735809309902717, + "kl": 0.036651611328125, + "learning_rate": 1.770031171519616e-06, + "loss": 0.0015, + "num_tokens": 2491271698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.827344883502603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022898707915000945, + "kl": 0.021514892578125, + "learning_rate": 1.7666482804307761e-06, + "loss": 0.0009, + "num_tokens": 2491838178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8275155756592985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028673570318553483, + "kl": 0.020721435546875, + "learning_rate": 1.7632683118845873e-06, + "loss": 0.0008, + "num_tokens": 2492404770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8276862678159939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015185981750882984, + "kl": 0.019195556640625, + "learning_rate": 1.759891267080821e-06, + "loss": 0.0008, + "num_tokens": 2492969986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8278569599726893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0044835397733218995, + "kl": 0.020751953125, + "learning_rate": 1.7565171472182009e-06, + "loss": 0.0008, + "num_tokens": 2493535938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8280276521293847, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018420295763210773, + "kl": 0.01812744140625, + "learning_rate": 1.7531459534944217e-06, + "loss": 0.0007, + "num_tokens": 2494101090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8281983442860801, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013799299736922215, + "kl": 0.019439697265625, + "learning_rate": 1.749777687106138e-06, + "loss": 0.0008, + "num_tokens": 2494668466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8283690364427755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019329321103189645, + "kl": 0.01953125, + "learning_rate": 1.7464123492489582e-06, + "loss": 0.0008, + "num_tokens": 2495235138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8285397285994709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02421059169453219, + "kl": 0.0408935546875, + "learning_rate": 1.743049941117455e-06, + "loss": 0.0016, + "num_tokens": 2495800386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8287104207561663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00925739734380982, + "kl": 0.023590087890625, + "learning_rate": 1.7396904639051636e-06, + "loss": 0.0009, + "num_tokens": 2496364578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8288811129128617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026404254226540125, + "kl": 0.01812744140625, + "learning_rate": 1.7363339188045803e-06, + "loss": 0.0007, + "num_tokens": 2496929218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.829051805069557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015427007910208893, + "kl": 0.019073486328125, + "learning_rate": 1.7329803070071549e-06, + "loss": 0.0008, + "num_tokens": 2497497602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8292224972262524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019723927273040686, + "kl": 0.04656982421875, + "learning_rate": 1.7296296297032944e-06, + "loss": 0.0019, + "num_tokens": 2498060050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8293931893829478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008192214442825469, + "kl": 0.02154541015625, + "learning_rate": 1.726281888082374e-06, + "loss": 0.0009, + "num_tokens": 2498625874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8295638815396432, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025795627586627868, + "kl": 0.019073486328125, + "learning_rate": 1.722937083332721e-06, + "loss": 0.0008, + "num_tokens": 2499188946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8297345736963386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011295516553643374, + "kl": 0.018524169921875, + "learning_rate": 1.7195952166416175e-06, + "loss": 0.0007, + "num_tokens": 2499757410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.829905265853034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017550181016006282, + "kl": 0.01788330078125, + "learning_rate": 1.716256289195305e-06, + "loss": 0.0007, + "num_tokens": 2500321426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8300759580097294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001875679878275624, + "kl": 0.017486572265625, + "learning_rate": 1.712920302178984e-06, + "loss": 0.0007, + "num_tokens": 2500890226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8302466501664248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008911621121908029, + "kl": 0.01800537109375, + "learning_rate": 1.7095872567768123e-06, + "loss": 0.0007, + "num_tokens": 2501452338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8304173423231203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002113597281597653, + "kl": 0.018096923828125, + "learning_rate": 1.7062571541718976e-06, + "loss": 0.0007, + "num_tokens": 2502015922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8305880344798157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006763348419760385, + "kl": 0.029937744140625, + "learning_rate": 1.7029299955463031e-06, + "loss": 0.0012, + "num_tokens": 2502611314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8307587266365111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011353227417952238, + "kl": 0.018280029296875, + "learning_rate": 1.699605782081053e-06, + "loss": 0.0007, + "num_tokens": 2503178834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8309294187932065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014343571724919589, + "kl": 0.031341552734375, + "learning_rate": 1.6962845149561235e-06, + "loss": 0.0013, + "num_tokens": 2503746130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8311001109499019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007798922776764916, + "kl": 0.01751708984375, + "learning_rate": 1.6929661953504495e-06, + "loss": 0.0007, + "num_tokens": 2504309538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8312708031065973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015145089449354653, + "kl": 0.017547607421875, + "learning_rate": 1.689650824441904e-06, + "loss": 0.0007, + "num_tokens": 2504874594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8314414952632927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017720624828729682, + "kl": 0.018951416015625, + "learning_rate": 1.6863384034073304e-06, + "loss": 0.0008, + "num_tokens": 2505442034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8316121874199881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02212302930913734, + "kl": 0.02105712890625, + "learning_rate": 1.683028933422517e-06, + "loss": 0.0008, + "num_tokens": 2506010306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8317828795766834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00126169711488284, + "kl": 0.01788330078125, + "learning_rate": 1.6797224156622094e-06, + "loss": 0.0007, + "num_tokens": 2506570258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8319535717333788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012909525796796536, + "kl": 0.018096923828125, + "learning_rate": 1.6764188513001e-06, + "loss": 0.0007, + "num_tokens": 2507137666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8321242638900742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002405837624655413, + "kl": 0.01788330078125, + "learning_rate": 1.673118241508831e-06, + "loss": 0.0007, + "num_tokens": 2507707666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8322949560467696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018453777527701171, + "kl": 0.01953125, + "learning_rate": 1.6698205874600038e-06, + "loss": 0.0008, + "num_tokens": 2508274082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.832465648203465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01570754332176762, + "kl": 0.0355224609375, + "learning_rate": 1.6665258903241677e-06, + "loss": 0.0014, + "num_tokens": 2508840722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8326363403601604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001374731896183571, + "kl": 0.017852783203125, + "learning_rate": 1.6632341512708194e-06, + "loss": 0.0007, + "num_tokens": 2509409874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8328070325168558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001819915470270191, + "kl": 0.01837158203125, + "learning_rate": 1.6599453714684055e-06, + "loss": 0.0007, + "num_tokens": 2509975202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8329777246735512, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.217163445117371, + "kl": 0.044158935546875, + "learning_rate": 1.6566595520843253e-06, + "loss": 0.0018, + "num_tokens": 2510539666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8331484168302467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002106023585858942, + "kl": 0.019439697265625, + "learning_rate": 1.6533766942849295e-06, + "loss": 0.0008, + "num_tokens": 2511106242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8333191089869421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014335409426392748, + "kl": 0.024139404296875, + "learning_rate": 1.6500967992355122e-06, + "loss": 0.001, + "num_tokens": 2511675442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8334898011436375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05288584719698598, + "kl": 0.08740234375, + "learning_rate": 1.6468198681003134e-06, + "loss": 0.0035, + "num_tokens": 2512239746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8336604933003329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01881820266700789, + "kl": 0.04443359375, + "learning_rate": 1.6435459020425305e-06, + "loss": 0.0018, + "num_tokens": 2512806098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8338311854570283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01961615525166466, + "kl": 0.0223388671875, + "learning_rate": 1.6402749022243026e-06, + "loss": 0.0009, + "num_tokens": 2513376338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8340018776137237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02058700372143429, + "kl": 0.048828125, + "learning_rate": 1.6370068698067165e-06, + "loss": 0.002, + "num_tokens": 2513957970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8341725697704191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011824495477747148, + "kl": 0.046356201171875, + "learning_rate": 1.6337418059498022e-06, + "loss": 0.0019, + "num_tokens": 2514519682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8343432619271145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010548476746873137, + "kl": 0.03045654296875, + "learning_rate": 1.6304797118125414e-06, + "loss": 0.0012, + "num_tokens": 2515088530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8345139540838098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004626965293366384, + "kl": 0.02789306640625, + "learning_rate": 1.6272205885528648e-06, + "loss": 0.0011, + "num_tokens": 2515653826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8346846462405052, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054242301924007884, + "kl": 0.10894775390625, + "learning_rate": 1.6239644373276375e-06, + "loss": 0.0044, + "num_tokens": 2516216786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8348553383972006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02717966304215192, + "kl": 0.05999755859375, + "learning_rate": 1.6207112592926766e-06, + "loss": 0.0024, + "num_tokens": 2516778578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.835026030553896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01162847293185331, + "kl": 0.0323486328125, + "learning_rate": 1.6174610556027426e-06, + "loss": 0.0013, + "num_tokens": 2517344130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8351967227105914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02246994042367005, + "kl": 0.051513671875, + "learning_rate": 1.6142138274115437e-06, + "loss": 0.0021, + "num_tokens": 2517912482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8353674148672868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016688393696023148, + "kl": 0.0406494140625, + "learning_rate": 1.6109695758717248e-06, + "loss": 0.0016, + "num_tokens": 2518473730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8355381070239822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026830961022986682, + "kl": 0.1331787109375, + "learning_rate": 1.6077283021348822e-06, + "loss": 0.0053, + "num_tokens": 2519034722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8357087991806776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006121111507982295, + "kl": 0.0738525390625, + "learning_rate": 1.6044900073515467e-06, + "loss": 0.003, + "num_tokens": 2519598226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.835879491337373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005055772985161858, + "kl": 0.02984619140625, + "learning_rate": 1.6012546926712003e-06, + "loss": 0.0012, + "num_tokens": 2520172354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8360501834940685, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.467652781748448, + "kl": 0.3699951171875, + "learning_rate": 1.5980223592422583e-06, + "loss": 0.0148, + "num_tokens": 2520735458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8362208756507639, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11692364450819902, + "kl": 0.20361328125, + "learning_rate": 1.5947930082120867e-06, + "loss": 0.0081, + "num_tokens": 2521302642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8363915678074593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21817678664172613, + "kl": 0.59375, + "learning_rate": 1.5915666407269847e-06, + "loss": 0.0237, + "num_tokens": 2521866098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8365622599641547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.277314391805386, + "kl": 1.169921875, + "learning_rate": 1.5883432579322e-06, + "loss": 0.0468, + "num_tokens": 2522431074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8367329521208501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7276684431915892, + "kl": 1.705078125, + "learning_rate": 1.5851228609719138e-06, + "loss": 0.0683, + "num_tokens": 2522991474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8369036442775455, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4001103362389777, + "kl": 1.361328125, + "learning_rate": 1.581905450989254e-06, + "loss": 0.0545, + "num_tokens": 2523559586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8370743364342409, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8549064635316374, + "kl": 1.892578125, + "learning_rate": 1.5786910291262814e-06, + "loss": 0.0758, + "num_tokens": 2524129458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8372450285909362, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1118109398027283, + "kl": 1.841796875, + "learning_rate": 1.5754795965240043e-06, + "loss": 0.0737, + "num_tokens": 2524697538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 2041.12109375, + "completions/mean_terminated_length": 287.0, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.8374157207476316, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.425260523122968, + "kl": 1.83203125, + "learning_rate": 1.5722711543223601e-06, + "loss": 0.0732, + "num_tokens": 2525258721.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.837586412904327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6552384102969891, + "kl": 1.5859375, + "learning_rate": 1.5690657036602352e-06, + "loss": 0.0634, + "num_tokens": 2525821777.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8377571050610224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.505854533509478, + "kl": 1.6875, + "learning_rate": 1.565863245675443e-06, + "loss": 0.0675, + "num_tokens": 2526387105.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8379277972177178, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.07438958028976, + "kl": 1.458984375, + "learning_rate": 1.5626637815047475e-06, + "loss": 0.0584, + "num_tokens": 2526955169.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8380984893744132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7565216691318074, + "kl": 1.177734375, + "learning_rate": 1.5594673122838355e-06, + "loss": 0.0471, + "num_tokens": 2527522977.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8382691815311086, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4868945525885192, + "kl": 1.1611328125, + "learning_rate": 1.556273839147342e-06, + "loss": 0.0464, + "num_tokens": 2528088833.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.838439873687804, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.321953370247038, + "kl": 1.2109375, + "learning_rate": 1.5530833632288377e-06, + "loss": 0.0484, + "num_tokens": 2528652321.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8386105658444994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3459685182202044, + "kl": 1.1171875, + "learning_rate": 1.5498958856608227e-06, + "loss": 0.0446, + "num_tokens": 2529215233.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8387812580011949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7208931976858355, + "kl": 1.1201171875, + "learning_rate": 1.5467114075747347e-06, + "loss": 0.0448, + "num_tokens": 2529776129.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8389519501578903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9006120302737531, + "kl": 1.0458984375, + "learning_rate": 1.543529930100952e-06, + "loss": 0.0418, + "num_tokens": 2530341057.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8391226423145857, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.712359780330485, + "kl": 1.568359375, + "learning_rate": 1.5403514543687837e-06, + "loss": 0.0627, + "num_tokens": 2530908113.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8392933344712811, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.5335776590185657, + "kl": 1.0751953125, + "learning_rate": 1.5371759815064745e-06, + "loss": 0.0431, + "num_tokens": 2531469761.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8394640266279765, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6307391197353918, + "kl": 0.9130859375, + "learning_rate": 1.534003512641199e-06, + "loss": 0.0365, + "num_tokens": 2532040497.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8396347187846719, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5474161018028212, + "kl": 0.9326171875, + "learning_rate": 1.5308340488990713e-06, + "loss": 0.0373, + "num_tokens": 2532604417.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8398054109413673, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7849068529122083, + "kl": 1.40234375, + "learning_rate": 1.5276675914051387e-06, + "loss": 0.056, + "num_tokens": 2533166161.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8399761030980627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9800666440760193, + "kl": 1.0830078125, + "learning_rate": 1.5245041412833783e-06, + "loss": 0.0433, + "num_tokens": 2533727761.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.840146795254758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7215213083641592, + "kl": 0.8759765625, + "learning_rate": 1.5213436996566976e-06, + "loss": 0.0351, + "num_tokens": 2534288721.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8403174874114534, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5890980977333509, + "kl": 0.81005859375, + "learning_rate": 1.5181862676469404e-06, + "loss": 0.0324, + "num_tokens": 2534853233.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8404881795681488, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.347826180244919, + "kl": 0.8115234375, + "learning_rate": 1.5150318463748858e-06, + "loss": 0.0324, + "num_tokens": 2535416977.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8406588717248442, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3518449845540275, + "kl": 0.7529296875, + "learning_rate": 1.5118804369602358e-06, + "loss": 0.0301, + "num_tokens": 2535988865.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8408295638815396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9356567523658694, + "kl": 0.7822265625, + "learning_rate": 1.5087320405216256e-06, + "loss": 0.0313, + "num_tokens": 2536552273.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.841000256038235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5358977554032298, + "kl": 0.919921875, + "learning_rate": 1.5055866581766232e-06, + "loss": 0.0369, + "num_tokens": 2537117281.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8411709481949304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31003008045998603, + "kl": 0.818359375, + "learning_rate": 1.5024442910417257e-06, + "loss": 0.0328, + "num_tokens": 2537681745.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8413416403516258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7926117838508475, + "kl": 0.9326171875, + "learning_rate": 1.4993049402323656e-06, + "loss": 0.0373, + "num_tokens": 2538256561.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8415123325083212, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0969688699657751, + "kl": 0.93359375, + "learning_rate": 1.49616860686289e-06, + "loss": 0.0373, + "num_tokens": 2538825233.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8416830246650167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7532339080641699, + "kl": 0.8681640625, + "learning_rate": 1.4930352920465886e-06, + "loss": 0.0348, + "num_tokens": 2539391505.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8418537168217121, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1651698200047023, + "kl": 0.7431640625, + "learning_rate": 1.4899049968956747e-06, + "loss": 0.0297, + "num_tokens": 2539958897.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8420244089784075, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.373871523073545, + "kl": 0.9970703125, + "learning_rate": 1.4867777225212943e-06, + "loss": 0.0398, + "num_tokens": 2540523857.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8421951011351029, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0460821385367878, + "kl": 0.7822265625, + "learning_rate": 1.483653470033508e-06, + "loss": 0.0313, + "num_tokens": 2541087969.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8423657932917983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5597881975878669, + "kl": 0.7978515625, + "learning_rate": 1.4805322405413181e-06, + "loss": 0.0319, + "num_tokens": 2541658769.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8425364854484937, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.322132986895069, + "kl": 0.9775390625, + "learning_rate": 1.4774140351526468e-06, + "loss": 0.039, + "num_tokens": 2542225649.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8427071776051891, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.800806108346697, + "kl": 0.7919921875, + "learning_rate": 1.4742988549743476e-06, + "loss": 0.0317, + "num_tokens": 2542788913.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8428778697618844, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0347204612092415, + "kl": 0.7265625, + "learning_rate": 1.471186701112195e-06, + "loss": 0.029, + "num_tokens": 2543357361.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8430485619185798, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.089428494070183, + "kl": 1.29296875, + "learning_rate": 1.4680775746708898e-06, + "loss": 0.0518, + "num_tokens": 2543918545.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8432192540752752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7640619233726184, + "kl": 0.970703125, + "learning_rate": 1.464971476754059e-06, + "loss": 0.0388, + "num_tokens": 2544481505.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8433899462319706, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2770743375242286, + "kl": 1.044921875, + "learning_rate": 1.461868408464261e-06, + "loss": 0.0419, + "num_tokens": 2545047633.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.843560638388666, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.373373970363318, + "kl": 1.53515625, + "learning_rate": 1.4587683709029688e-06, + "loss": 0.0614, + "num_tokens": 2545610497.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8437313305453614, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.291241510566801, + "kl": 1.60546875, + "learning_rate": 1.455671365170581e-06, + "loss": 0.0642, + "num_tokens": 2546172577.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8439020227020568, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7370127936466975, + "kl": 1.119140625, + "learning_rate": 1.4525773923664267e-06, + "loss": 0.0448, + "num_tokens": 2546739441.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8440727148587522, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8045482905303383, + "kl": 1.130859375, + "learning_rate": 1.4494864535887564e-06, + "loss": 0.0453, + "num_tokens": 2547302273.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8442434070154476, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7138668563983575, + "kl": 1.1044921875, + "learning_rate": 1.446398549934739e-06, + "loss": 0.0442, + "num_tokens": 2547869345.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.844414099172143, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.445676319495967, + "kl": 1.51171875, + "learning_rate": 1.4433136825004668e-06, + "loss": 0.0605, + "num_tokens": 2548431089.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8445847913288385, + "frac_reward_zero_std": 1.0, + "grad_norm": 28.152926588245588, + "kl": 3.53515625, + "learning_rate": 1.440231852380959e-06, + "loss": 0.1413, + "num_tokens": 2548996193.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8447554834855339, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.337627284544324, + "kl": 1.0654296875, + "learning_rate": 1.4371530606701544e-06, + "loss": 0.0426, + "num_tokens": 2549561777.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8449261756422293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.47239986073581575, + "kl": 0.9716796875, + "learning_rate": 1.4340773084609138e-06, + "loss": 0.0388, + "num_tokens": 2550138017.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8450968677989247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.370356077592517, + "kl": 1.03125, + "learning_rate": 1.4310045968450148e-06, + "loss": 0.0412, + "num_tokens": 2550713025.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8452675599556201, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.205784311444074, + "kl": 1.3046875, + "learning_rate": 1.4279349269131593e-06, + "loss": 0.0522, + "num_tokens": 2551278305.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8454382521123155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8548491045377903, + "kl": 1.244140625, + "learning_rate": 1.4248682997549744e-06, + "loss": 0.0498, + "num_tokens": 2551842753.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8456089442690108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.60949266013391, + "kl": 0.92578125, + "learning_rate": 1.4218047164589966e-06, + "loss": 0.037, + "num_tokens": 2552409969.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8457796364257062, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.291537937725266, + "kl": 0.95703125, + "learning_rate": 1.418744178112691e-06, + "loss": 0.0383, + "num_tokens": 2552979313.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8459503285824016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42693235835818827, + "kl": 1.0419921875, + "learning_rate": 1.4156866858024343e-06, + "loss": 0.0417, + "num_tokens": 2553543057.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.846121020739097, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1367990674631925, + "kl": 1.076171875, + "learning_rate": 1.4126322406135306e-06, + "loss": 0.0431, + "num_tokens": 2554101585.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8462917128957924, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5134909995368482, + "kl": 0.9443359375, + "learning_rate": 1.4095808436301928e-06, + "loss": 0.0378, + "num_tokens": 2554663777.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8464624050524878, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9371281917504595, + "kl": 1.1171875, + "learning_rate": 1.406532495935562e-06, + "loss": 0.0447, + "num_tokens": 2555225073.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8466330972091832, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.1838062701035144, + "kl": 1.119140625, + "learning_rate": 1.403487198611686e-06, + "loss": 0.0447, + "num_tokens": 2555793553.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8468037893658786, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7453303279257473, + "kl": 0.9296875, + "learning_rate": 1.4004449527395415e-06, + "loss": 0.0371, + "num_tokens": 2556361953.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.846974481522574, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.3653140749231594, + "kl": 1.2783203125, + "learning_rate": 1.3974057593990086e-06, + "loss": 0.0512, + "num_tokens": 2556925329.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8471451736792694, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.728420553118516, + "kl": 0.9765625, + "learning_rate": 1.3943696196688993e-06, + "loss": 0.0391, + "num_tokens": 2557493489.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8473158658359649, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3702007869405521, + "kl": 0.98828125, + "learning_rate": 1.3913365346269269e-06, + "loss": 0.0395, + "num_tokens": 2558058689.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8474865579926603, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8673729794768605, + "kl": 1.125, + "learning_rate": 1.3883065053497313e-06, + "loss": 0.045, + "num_tokens": 2558624257.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8476572501493557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7682448327994316, + "kl": 0.6572265625, + "learning_rate": 1.385279532912861e-06, + "loss": 0.0263, + "num_tokens": 2559195841.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8478279423060511, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4361841970379143, + "kl": 0.890625, + "learning_rate": 1.3822556183907842e-06, + "loss": 0.0357, + "num_tokens": 2559764641.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8479986344627465, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.878150683634893, + "kl": 1.130859375, + "learning_rate": 1.379234762856878e-06, + "loss": 0.0452, + "num_tokens": 2560329441.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8481693266194419, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0416222829631787, + "kl": 0.875, + "learning_rate": 1.3762169673834413e-06, + "loss": 0.035, + "num_tokens": 2560894657.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8483400187761372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5355866451575709, + "kl": 0.890625, + "learning_rate": 1.3732022330416795e-06, + "loss": 0.0356, + "num_tokens": 2561461585.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8485107109328326, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6501912715193143, + "kl": 0.8994140625, + "learning_rate": 1.370190560901714e-06, + "loss": 0.036, + "num_tokens": 2562020529.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.848681403089528, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.824713400690129, + "kl": 0.9951171875, + "learning_rate": 1.3671819520325847e-06, + "loss": 0.0398, + "num_tokens": 2562587009.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8488520952462234, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.673118512625456, + "kl": 1.205078125, + "learning_rate": 1.3641764075022345e-06, + "loss": 0.0482, + "num_tokens": 2563162433.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8490227874029188, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3630719909751456, + "kl": 1.2421875, + "learning_rate": 1.3611739283775227e-06, + "loss": 0.0497, + "num_tokens": 2563726833.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8491934795596142, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.0252598275951272, + "kl": 1.513671875, + "learning_rate": 1.358174515724222e-06, + "loss": 0.0606, + "num_tokens": 2564287089.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8493641717163096, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.500013139531357, + "kl": 1.29296875, + "learning_rate": 1.355178170607019e-06, + "loss": 0.0517, + "num_tokens": 2564852161.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.849534863873005, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.134241278769133, + "kl": 1.056640625, + "learning_rate": 1.3521848940895043e-06, + "loss": 0.0422, + "num_tokens": 2565414833.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8497055560297004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7546457633010513, + "kl": 0.90234375, + "learning_rate": 1.3491946872341833e-06, + "loss": 0.0361, + "num_tokens": 2565984129.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8498762481863958, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.999409067262646, + "kl": 1.1357421875, + "learning_rate": 1.3462075511024708e-06, + "loss": 0.0454, + "num_tokens": 2566548417.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8500469403430913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9464827040674137, + "kl": 0.765625, + "learning_rate": 1.3432234867546968e-06, + "loss": 0.0306, + "num_tokens": 2567109633.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8502176324997867, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.916987923816842, + "kl": 1.07666015625, + "learning_rate": 1.3402424952500937e-06, + "loss": 0.043, + "num_tokens": 2567681633.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8503883246564821, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2573293552476776, + "kl": 0.7744140625, + "learning_rate": 1.3372645776468018e-06, + "loss": 0.031, + "num_tokens": 2568245633.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8505590168131775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4212898113273838, + "kl": 0.5576171875, + "learning_rate": 1.3342897350018792e-06, + "loss": 0.0223, + "num_tokens": 2568816353.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8507297089698729, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.963980902482054, + "kl": 0.671875, + "learning_rate": 1.3313179683712884e-06, + "loss": 0.0269, + "num_tokens": 2569384289.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8509004011265683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.620724035203289, + "kl": 0.578125, + "learning_rate": 1.3283492788098972e-06, + "loss": 0.0231, + "num_tokens": 2569951137.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8510710932832636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6988769353269496, + "kl": 0.544921875, + "learning_rate": 1.3253836673714815e-06, + "loss": 0.0218, + "num_tokens": 2570512865.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.851241785439959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5564066810650928, + "kl": 0.54833984375, + "learning_rate": 1.3224211351087291e-06, + "loss": 0.0219, + "num_tokens": 2571077681.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8514124775966544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6974995974298295, + "kl": 0.54052734375, + "learning_rate": 1.319461683073232e-06, + "loss": 0.0217, + "num_tokens": 2571643761.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8515831697533498, + "frac_reward_zero_std": 1.0, + "grad_norm": 89.13262806548036, + "kl": 5.974609375, + "learning_rate": 1.316505312315488e-06, + "loss": 0.2389, + "num_tokens": 2572207841.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8517538619100452, + "frac_reward_zero_std": 1.0, + "grad_norm": 18.03978558555703, + "kl": 1.64453125, + "learning_rate": 1.3135520238848997e-06, + "loss": 0.0659, + "num_tokens": 2572770033.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8519245540667406, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.240380539094544, + "kl": 0.515625, + "learning_rate": 1.3106018188297808e-06, + "loss": 0.0206, + "num_tokens": 2573337377.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.852095246223436, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.112021107350147, + "kl": 0.6923828125, + "learning_rate": 1.307654698197347e-06, + "loss": 0.0276, + "num_tokens": 2573907073.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8522659383801314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3765948900886992, + "kl": 0.43994140625, + "learning_rate": 1.3047106630337236e-06, + "loss": 0.0176, + "num_tokens": 2574473825.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8524366305368268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7419730331440697, + "kl": 0.59765625, + "learning_rate": 1.3017697143839291e-06, + "loss": 0.0239, + "num_tokens": 2575037697.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8526073226935222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33541019524006543, + "kl": 0.71728515625, + "learning_rate": 1.2988318532918975e-06, + "loss": 0.0287, + "num_tokens": 2575613457.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8527780148502176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7134001462047933, + "kl": 0.625, + "learning_rate": 1.2958970808004634e-06, + "loss": 0.025, + "num_tokens": 2576181457.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.852948707006913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6476242767056809, + "kl": 0.63671875, + "learning_rate": 1.292965397951369e-06, + "loss": 0.0255, + "num_tokens": 2576748097.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8531193991636085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8338837068628914, + "kl": 0.7939453125, + "learning_rate": 1.2900368057852507e-06, + "loss": 0.0317, + "num_tokens": 2577317745.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8532900913203039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.704959887554244, + "kl": 0.859375, + "learning_rate": 1.2871113053416539e-06, + "loss": 0.0344, + "num_tokens": 2577887873.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 4999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8534607834769993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8011043442189512, + "kl": 0.98046875, + "learning_rate": 1.2841888976590255e-06, + "loss": 0.0392, + "num_tokens": 2578459249.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8536314756336947, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0312179862111028, + "kl": 1.314453125, + "learning_rate": 1.2812695837747192e-06, + "loss": 0.0526, + "num_tokens": 2579026273.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.85380216779039, + "frac_reward_zero_std": 1.0, + "grad_norm": 16.750452510550875, + "kl": 1.984375, + "learning_rate": 1.2783533647249814e-06, + "loss": 0.0794, + "num_tokens": 2579590161.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8539728599470854, + "frac_reward_zero_std": 1.0, + "grad_norm": 16.742419292298884, + "kl": 1.978515625, + "learning_rate": 1.2754402415449641e-06, + "loss": 0.0791, + "num_tokens": 2580158529.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8541435521037808, + "frac_reward_zero_std": 1.0, + "grad_norm": 62.20526293916013, + "kl": 5.01171875, + "learning_rate": 1.2725302152687224e-06, + "loss": 0.2005, + "num_tokens": 2580724001.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8543142442604762, + "frac_reward_zero_std": 1.0, + "grad_norm": 26.68725662812223, + "kl": 2.98046875, + "learning_rate": 1.2696232869292136e-06, + "loss": 0.1194, + "num_tokens": 2581286177.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8544849364171716, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.002062074642911, + "kl": 1.890625, + "learning_rate": 1.2667194575582886e-06, + "loss": 0.0757, + "num_tokens": 2581850369.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.854655628573867, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6707968198871241, + "kl": 1.806640625, + "learning_rate": 1.263818728186701e-06, + "loss": 0.0722, + "num_tokens": 2582412881.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8548263207305624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.981364529876564, + "kl": 1.763671875, + "learning_rate": 1.2609210998441067e-06, + "loss": 0.0705, + "num_tokens": 2582973921.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8549970128872578, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3335453049678316, + "kl": 1.66796875, + "learning_rate": 1.2580265735590612e-06, + "loss": 0.0666, + "num_tokens": 2583538721.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8551677050439532, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.953657207096963, + "kl": 1.85546875, + "learning_rate": 1.2551351503590148e-06, + "loss": 0.0742, + "num_tokens": 2584097985.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8553383972006486, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5220714660920323, + "kl": 1.5078125, + "learning_rate": 1.2522468312703172e-06, + "loss": 0.0604, + "num_tokens": 2584665569.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.855509089357344, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9702357058285496, + "kl": 1.748046875, + "learning_rate": 1.249361617318219e-06, + "loss": 0.07, + "num_tokens": 2585229809.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8556797815140395, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2072477692493875, + "kl": 1.697265625, + "learning_rate": 1.2464795095268678e-06, + "loss": 0.0679, + "num_tokens": 2585798257.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8558504736707349, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.915237161266592, + "kl": 1.52734375, + "learning_rate": 1.2436005089193047e-06, + "loss": 0.0611, + "num_tokens": 2586363137.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8560211658274303, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.5435739642884583, + "kl": 2.021484375, + "learning_rate": 1.2407246165174757e-06, + "loss": 0.0809, + "num_tokens": 2586927793.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8561918579841257, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6171445642440703, + "kl": 1.232421875, + "learning_rate": 1.2378518333422141e-06, + "loss": 0.0493, + "num_tokens": 2587495217.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8563625501408211, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3518736682445842, + "kl": 1.16015625, + "learning_rate": 1.2349821604132584e-06, + "loss": 0.0463, + "num_tokens": 2588062497.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8565332422975165, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3441292987106694, + "kl": 1.15234375, + "learning_rate": 1.232115598749234e-06, + "loss": 0.0461, + "num_tokens": 2588626817.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8567039344542118, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.915561218525826, + "kl": 1.5, + "learning_rate": 1.2292521493676724e-06, + "loss": 0.06, + "num_tokens": 2589200065.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8568746266109072, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.816265123589556, + "kl": 1.740234375, + "learning_rate": 1.2263918132849894e-06, + "loss": 0.0696, + "num_tokens": 2589765041.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8570453187676026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41908618604004105, + "kl": 1.1328125, + "learning_rate": 1.2235345915165065e-06, + "loss": 0.0453, + "num_tokens": 2590334945.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.857216010924298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8510731229792543, + "kl": 1.388671875, + "learning_rate": 1.2206804850764287e-06, + "loss": 0.0556, + "num_tokens": 2590907889.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8573867030809934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6398085464872226, + "kl": 1.55859375, + "learning_rate": 1.2178294949778657e-06, + "loss": 0.0623, + "num_tokens": 2591471425.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8575573952376888, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.408263675518697, + "kl": 1.33203125, + "learning_rate": 1.2149816222328115e-06, + "loss": 0.0533, + "num_tokens": 2592035313.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8577280873943842, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1586837276215198, + "kl": 1.333984375, + "learning_rate": 1.2121368678521628e-06, + "loss": 0.0533, + "num_tokens": 2592606273.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8578987795510796, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.963395491745929, + "kl": 1.294921875, + "learning_rate": 1.2092952328456998e-06, + "loss": 0.0518, + "num_tokens": 2593170193.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.858069471707775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6254720918690907, + "kl": 1.25, + "learning_rate": 1.2064567182221055e-06, + "loss": 0.0501, + "num_tokens": 2593736385.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8582401638644704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7946387548370504, + "kl": 1.00390625, + "learning_rate": 1.2036213249889462e-06, + "loss": 0.0401, + "num_tokens": 2594305729.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8584108560211658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6721588342243964, + "kl": 0.984375, + "learning_rate": 1.2007890541526867e-06, + "loss": 0.0394, + "num_tokens": 2594871073.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8585815481778613, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.328441800456438, + "kl": 1.2734375, + "learning_rate": 1.1979599067186776e-06, + "loss": 0.0509, + "num_tokens": 2595436129.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8587522403345567, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4291728051588424, + "kl": 0.9248046875, + "learning_rate": 1.1951338836911674e-06, + "loss": 0.037, + "num_tokens": 2596004145.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8589229324912521, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6613280963838335, + "kl": 0.9072265625, + "learning_rate": 1.1923109860732906e-06, + "loss": 0.0363, + "num_tokens": 2596568945.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8590936246479475, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0412078897733823, + "kl": 1.037109375, + "learning_rate": 1.1894912148670755e-06, + "loss": 0.0415, + "num_tokens": 2597136897.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8592643168046429, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0048190666209267, + "kl": 0.974609375, + "learning_rate": 1.1866745710734363e-06, + "loss": 0.039, + "num_tokens": 2597698577.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8594350089613382, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.6017431787592136, + "kl": 1.373046875, + "learning_rate": 1.1838610556921814e-06, + "loss": 0.0549, + "num_tokens": 2598264417.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8596057011180336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8877273451228596, + "kl": 1.130859375, + "learning_rate": 1.1810506697220104e-06, + "loss": 0.0452, + "num_tokens": 2598833089.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.859776393274729, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3499041229513509, + "kl": 1.314453125, + "learning_rate": 1.1782434141605048e-06, + "loss": 0.0526, + "num_tokens": 2599401489.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8599470854314244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5045532089150672, + "kl": 1.0458984375, + "learning_rate": 1.1754392900041388e-06, + "loss": 0.0418, + "num_tokens": 2599971377.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8601177775881198, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.9421404324673364, + "kl": 1.205078125, + "learning_rate": 1.1726382982482753e-06, + "loss": 0.0482, + "num_tokens": 2600541633.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8602884697448152, + "frac_reward_zero_std": 1.0, + "grad_norm": 50.28771185042886, + "kl": 6.1953125, + "learning_rate": 1.1698404398871699e-06, + "loss": 0.2469, + "num_tokens": 2601104209.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8604591619015106, + "frac_reward_zero_std": 1.0, + "grad_norm": 10.655509726699325, + "kl": 1.83203125, + "learning_rate": 1.167045715913957e-06, + "loss": 0.0733, + "num_tokens": 2601673553.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.860629854058206, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.7491821006800663, + "kl": 1.81640625, + "learning_rate": 1.164254127320662e-06, + "loss": 0.0727, + "num_tokens": 2602238881.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8608005462149014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6946998639876036, + "kl": 1.666015625, + "learning_rate": 1.161465675098199e-06, + "loss": 0.0667, + "num_tokens": 2602803729.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8609712383715968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.45685345792096915, + "kl": 1.619140625, + "learning_rate": 1.1586803602363717e-06, + "loss": 0.0647, + "num_tokens": 2603387761.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8611419305282922, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4892256744915313, + "kl": 1.630859375, + "learning_rate": 1.1558981837238626e-06, + "loss": 0.0652, + "num_tokens": 2603955105.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8613126226849876, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0857699394447096, + "kl": 1.580078125, + "learning_rate": 1.1531191465482417e-06, + "loss": 0.0632, + "num_tokens": 2604518513.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8614833148416831, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0601243546768124, + "kl": 1.46484375, + "learning_rate": 1.150343249695971e-06, + "loss": 0.0585, + "num_tokens": 2605080865.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8616540069983785, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.874253336668202, + "kl": 1.595703125, + "learning_rate": 1.1475704941523934e-06, + "loss": 0.0639, + "num_tokens": 2605652817.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8618246991550739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4925206575926901, + "kl": 1.177734375, + "learning_rate": 1.1448008809017376e-06, + "loss": 0.0471, + "num_tokens": 2606214209.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8619953913117693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7289343897593068, + "kl": 0.998046875, + "learning_rate": 1.142034410927111e-06, + "loss": 0.0399, + "num_tokens": 2606780177.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8621660834684646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2720540979590454, + "kl": 1.029296875, + "learning_rate": 1.1392710852105149e-06, + "loss": 0.0412, + "num_tokens": 2607347073.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.86233677562516, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0997707107273482, + "kl": 0.90625, + "learning_rate": 1.1365109047328294e-06, + "loss": 0.0363, + "num_tokens": 2607917857.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1.0, + "completions/mean_length": 2040.00390625, + "completions/mean_terminated_length": 1.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.8625074677818554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9596735483927563, + "kl": 0.9619140625, + "learning_rate": 1.1337538704738239e-06, + "loss": 0.0384, + "num_tokens": 2608487634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8626781599385508, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.607687387242837, + "kl": 1.22265625, + "learning_rate": 1.1309999834121365e-06, + "loss": 0.0489, + "num_tokens": 2609051090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8628488520952462, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8086789330442934, + "kl": 1.197265625, + "learning_rate": 1.1282492445253035e-06, + "loss": 0.0479, + "num_tokens": 2609622898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8630195442519416, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0651059259761337, + "kl": 1.16015625, + "learning_rate": 1.1255016547897358e-06, + "loss": 0.0464, + "num_tokens": 2610201010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.863190236408637, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2781945262097305, + "kl": 1.16015625, + "learning_rate": 1.122757215180732e-06, + "loss": 0.0464, + "num_tokens": 2610764482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8633609285653324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9449454019669556, + "kl": 0.984375, + "learning_rate": 1.1200159266724675e-06, + "loss": 0.0394, + "num_tokens": 2611331394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8635316207220278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5173255173146057, + "kl": 0.712890625, + "learning_rate": 1.1172777902379972e-06, + "loss": 0.0285, + "num_tokens": 2611895314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8637023128787232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42328539141458305, + "kl": 0.5517578125, + "learning_rate": 1.1145428068492635e-06, + "loss": 0.0221, + "num_tokens": 2612457426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8638730050354186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2587234498627999, + "kl": 0.34619140625, + "learning_rate": 1.1118109774770892e-06, + "loss": 0.0139, + "num_tokens": 2613025762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.864043697192114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10517389627040893, + "kl": 0.208251953125, + "learning_rate": 1.1090823030911725e-06, + "loss": 0.0083, + "num_tokens": 2613586962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8642143893488095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06073962549659687, + "kl": 0.125732421875, + "learning_rate": 1.106356784660092e-06, + "loss": 0.005, + "num_tokens": 2614154658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8643850815055049, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057428616079702205, + "kl": 0.105224609375, + "learning_rate": 1.1036344231513107e-06, + "loss": 0.0042, + "num_tokens": 2614722162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8645557736622003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0218519658321315, + "kl": 0.06689453125, + "learning_rate": 1.1009152195311711e-06, + "loss": 0.0027, + "num_tokens": 2615286786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8647264658188957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012278522578391427, + "kl": 0.05706787109375, + "learning_rate": 1.0981991747648912e-06, + "loss": 0.0023, + "num_tokens": 2615848610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.864897157975591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009492522947891404, + "kl": 0.0457763671875, + "learning_rate": 1.0954862898165642e-06, + "loss": 0.0018, + "num_tokens": 2616413154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8650678501322864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008682821310598084, + "kl": 0.0355224609375, + "learning_rate": 1.0927765656491707e-06, + "loss": 0.0014, + "num_tokens": 2616981186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8652385422889818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015917303391324425, + "kl": 0.034912109375, + "learning_rate": 1.0900700032245659e-06, + "loss": 0.0014, + "num_tokens": 2617549810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8654092344456772, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0071455324064112515, + "kl": 0.03253173828125, + "learning_rate": 1.0873666035034802e-06, + "loss": 0.0013, + "num_tokens": 2618122274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8655799266023726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01033478888132906, + "kl": 0.030792236328125, + "learning_rate": 1.0846663674455205e-06, + "loss": 0.0012, + "num_tokens": 2618685394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.865750618759068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007963026013095042, + "kl": 0.02880859375, + "learning_rate": 1.0819692960091755e-06, + "loss": 0.0012, + "num_tokens": 2619253986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8659213109157634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011495338679717245, + "kl": 0.0274658203125, + "learning_rate": 1.0792753901518093e-06, + "loss": 0.0011, + "num_tokens": 2619822370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8660920030724588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024374025272428018, + "kl": 0.02667236328125, + "learning_rate": 1.0765846508296584e-06, + "loss": 0.0011, + "num_tokens": 2620388242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8662626952291542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007975048756582689, + "kl": 0.0262451171875, + "learning_rate": 1.0738970789978408e-06, + "loss": 0.001, + "num_tokens": 2620954722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8664333873858496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016122276408959555, + "kl": 0.027862548828125, + "learning_rate": 1.0712126756103458e-06, + "loss": 0.0011, + "num_tokens": 2621521810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.866604079542545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012742676504749091, + "kl": 0.024658203125, + "learning_rate": 1.0685314416200421e-06, + "loss": 0.001, + "num_tokens": 2622086146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8667747716992404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002305436508417489, + "kl": 0.023956298828125, + "learning_rate": 1.065853377978666e-06, + "loss": 0.001, + "num_tokens": 2622654226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8669454638559358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008590619017894905, + "kl": 0.02301025390625, + "learning_rate": 1.0631784856368388e-06, + "loss": 0.0009, + "num_tokens": 2623217154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8671161560126313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0059732691349669645, + "kl": 0.022430419921875, + "learning_rate": 1.060506765544047e-06, + "loss": 0.0009, + "num_tokens": 2623783506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8672868481693267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018404866308742993, + "kl": 0.022369384765625, + "learning_rate": 1.057838218648658e-06, + "loss": 0.0009, + "num_tokens": 2624348418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8674575403260221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006363513650788321, + "kl": 0.0220947265625, + "learning_rate": 1.055172845897906e-06, + "loss": 0.0009, + "num_tokens": 2624919522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8676282324827174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0049411469579437265, + "kl": 0.0225830078125, + "learning_rate": 1.0525106482379065e-06, + "loss": 0.0009, + "num_tokens": 2625491122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8677989246394128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17008791986455996, + "kl": 0.03607177734375, + "learning_rate": 1.0498516266136383e-06, + "loss": 0.0014, + "num_tokens": 2626065682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8679696167961082, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004133888493754658, + "kl": 0.022491455078125, + "learning_rate": 1.0471957819689627e-06, + "loss": 0.0009, + "num_tokens": 2626638514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8681403089528036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035590422713603818, + "kl": 0.022125244140625, + "learning_rate": 1.0445431152466057e-06, + "loss": 0.0009, + "num_tokens": 2627207346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.868311001109499, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016518612700301457, + "kl": 0.021026611328125, + "learning_rate": 1.0418936273881708e-06, + "loss": 0.0008, + "num_tokens": 2627777234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8684816932661944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003310787469092825, + "kl": 0.023345947265625, + "learning_rate": 1.0392473193341267e-06, + "loss": 0.0009, + "num_tokens": 2628343506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8686523854228898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00403673421010254, + "kl": 0.02294921875, + "learning_rate": 1.0366041920238223e-06, + "loss": 0.0009, + "num_tokens": 2628908690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8688230775795852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01174596190049759, + "kl": 0.023590087890625, + "learning_rate": 1.033964246395468e-06, + "loss": 0.0009, + "num_tokens": 2629468834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8689937697362806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008124573351067395, + "kl": 0.023712158203125, + "learning_rate": 1.031327483386152e-06, + "loss": 0.0009, + "num_tokens": 2630041362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.869164461892976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026503808599494354, + "kl": 0.023162841796875, + "learning_rate": 1.0286939039318277e-06, + "loss": 0.0009, + "num_tokens": 2630609618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8693351540496714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006549442976212187, + "kl": 0.02142333984375, + "learning_rate": 1.0260635089673233e-06, + "loss": 0.0009, + "num_tokens": 2631181826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8695058462063668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005944371361918947, + "kl": 0.02203369140625, + "learning_rate": 1.0234362994263315e-06, + "loss": 0.0009, + "num_tokens": 2631744050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8696765383630622, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02330938424042426, + "kl": 0.02703857421875, + "learning_rate": 1.0208122762414174e-06, + "loss": 0.0011, + "num_tokens": 2632307362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8698472305197577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04077248541633171, + "kl": 0.038726806640625, + "learning_rate": 1.0181914403440175e-06, + "loss": 0.0015, + "num_tokens": 2632876962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8700179226764531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017912226870463923, + "kl": 0.025390625, + "learning_rate": 1.0155737926644305e-06, + "loss": 0.001, + "num_tokens": 2633446802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8701886148331485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012387707369357893, + "kl": 0.024505615234375, + "learning_rate": 1.0129593341318267e-06, + "loss": 0.001, + "num_tokens": 2634014882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8703593069898438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02138840770534342, + "kl": 0.024810791015625, + "learning_rate": 1.010348065674246e-06, + "loss": 0.001, + "num_tokens": 2634583634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8705299991465392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005350264334171149, + "kl": 0.023162841796875, + "learning_rate": 1.0077399882185967e-06, + "loss": 0.0009, + "num_tokens": 2635149074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8707006913032346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014094300057111972, + "kl": 0.0267333984375, + "learning_rate": 1.0051351026906486e-06, + "loss": 0.0011, + "num_tokens": 2635708690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.87087138345993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027480728183370308, + "kl": 0.0269775390625, + "learning_rate": 1.0025334100150418e-06, + "loss": 0.0011, + "num_tokens": 2636267794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8710420756166254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12131130509409124, + "kl": 0.036865234375, + "learning_rate": 9.999349111152844e-07, + "loss": 0.0015, + "num_tokens": 2636833442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8712127677733208, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09420425293905411, + "kl": 0.043212890625, + "learning_rate": 9.973396069137519e-07, + "loss": 0.0017, + "num_tokens": 2637406242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8713834599300162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013738999523635883, + "kl": 0.02490234375, + "learning_rate": 9.947474983316806e-07, + "loss": 0.001, + "num_tokens": 2637968578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8715541520867116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022253353116975915, + "kl": 0.029510498046875, + "learning_rate": 9.92158586289176e-07, + "loss": 0.0012, + "num_tokens": 2638538226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.871724844243407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020914027362488354, + "kl": 0.02911376953125, + "learning_rate": 9.895728717052078e-07, + "loss": 0.0012, + "num_tokens": 2639103106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8718955364001024, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05242094499125242, + "kl": 0.032501220703125, + "learning_rate": 9.869903554976146e-07, + "loss": 0.0013, + "num_tokens": 2639676114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8720662285567978, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18600885277139725, + "kl": 0.04144287109375, + "learning_rate": 9.844110385830952e-07, + "loss": 0.0017, + "num_tokens": 2640243298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8722369207134932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02379725117804128, + "kl": 0.031341552734375, + "learning_rate": 9.818349218772094e-07, + "loss": 0.0013, + "num_tokens": 2640812610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8724076128701886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019274727602355025, + "kl": 0.03155517578125, + "learning_rate": 9.792620062943902e-07, + "loss": 0.0013, + "num_tokens": 2641385474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.872578305026884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02937983537131474, + "kl": 0.0361328125, + "learning_rate": 9.766922927479293e-07, + "loss": 0.0014, + "num_tokens": 2641953138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8727489971835795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020795893166769944, + "kl": 0.0357666015625, + "learning_rate": 9.74125782149985e-07, + "loss": 0.0014, + "num_tokens": 2642515058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8729196893402749, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04889133130672479, + "kl": 0.04132080078125, + "learning_rate": 9.715624754115683e-07, + "loss": 0.0017, + "num_tokens": 2643080930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8730903814969703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034930517553642945, + "kl": 0.04913330078125, + "learning_rate": 9.690023734425658e-07, + "loss": 0.002, + "num_tokens": 2643658818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8732610736536656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025913884878764122, + "kl": 0.04974365234375, + "learning_rate": 9.664454771517206e-07, + "loss": 0.002, + "num_tokens": 2644221650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.873431765810361, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.479882021069196, + "kl": 0.14202880859375, + "learning_rate": 9.638917874466392e-07, + "loss": 0.0057, + "num_tokens": 2644787154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8736024579670564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02322906543067606, + "kl": 0.0552978515625, + "learning_rate": 9.613413052337894e-07, + "loss": 0.0022, + "num_tokens": 2645352978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8737731501237518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29768160085440665, + "kl": 0.0777587890625, + "learning_rate": 9.587940314184973e-07, + "loss": 0.0031, + "num_tokens": 2645921698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8739438422804472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21043065889607737, + "kl": 0.0887451171875, + "learning_rate": 9.562499669049552e-07, + "loss": 0.0035, + "num_tokens": 2646488386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8741145344371426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08481858349018573, + "kl": 0.1103515625, + "learning_rate": 9.537091125962162e-07, + "loss": 0.0044, + "num_tokens": 2647051378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.874285226593838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12027398192550964, + "kl": 0.1290283203125, + "learning_rate": 9.511714693941898e-07, + "loss": 0.0052, + "num_tokens": 2647614818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8744559187505334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06388324570029125, + "kl": 0.08984375, + "learning_rate": 9.486370381996457e-07, + "loss": 0.0036, + "num_tokens": 2648181698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8746266109072288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09301448581044856, + "kl": 0.115478515625, + "learning_rate": 9.461058199122187e-07, + "loss": 0.0046, + "num_tokens": 2648747586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8747973030639242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15109068012015184, + "kl": 0.148681640625, + "learning_rate": 9.435778154303998e-07, + "loss": 0.0059, + "num_tokens": 2649313746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8749679952206196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12655411613335343, + "kl": 0.1466064453125, + "learning_rate": 9.410530256515382e-07, + "loss": 0.0059, + "num_tokens": 2649880498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875138687377315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6557894777059036, + "kl": 0.16650390625, + "learning_rate": 9.385314514718413e-07, + "loss": 0.0067, + "num_tokens": 2650460466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8753093795340104, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.886582172051661, + "kl": 0.201171875, + "learning_rate": 9.360130937863799e-07, + "loss": 0.008, + "num_tokens": 2651033586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8754800716907059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23134979126100713, + "kl": 0.2059326171875, + "learning_rate": 9.334979534890798e-07, + "loss": 0.0082, + "num_tokens": 2651601122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8756507638474013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.537021286722695, + "kl": 0.3115234375, + "learning_rate": 9.309860314727249e-07, + "loss": 0.0125, + "num_tokens": 2652164162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8758214560040967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37519296944091757, + "kl": 0.298828125, + "learning_rate": 9.28477328628954e-07, + "loss": 0.012, + "num_tokens": 2652729890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.875992148160792, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13325987202594267, + "kl": 0.232421875, + "learning_rate": 9.259718458482681e-07, + "loss": 0.0093, + "num_tokens": 2653298946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8761628403174874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6244292199427907, + "kl": 0.314697265625, + "learning_rate": 9.234695840200259e-07, + "loss": 0.0126, + "num_tokens": 2653866898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8763335324741828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4924902011289629, + "kl": 0.42041015625, + "learning_rate": 9.209705440324368e-07, + "loss": 0.0168, + "num_tokens": 2654432034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8765042246308782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7341320779830532, + "kl": 0.5498046875, + "learning_rate": 9.184747267725691e-07, + "loss": 0.022, + "num_tokens": 2654994530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8766749167875736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4060809012182464, + "kl": 0.685546875, + "learning_rate": 9.159821331263497e-07, + "loss": 0.0274, + "num_tokens": 2655558674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.876845608944269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8797112481599392, + "kl": 0.849609375, + "learning_rate": 9.134927639785596e-07, + "loss": 0.034, + "num_tokens": 2656119282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8770163011009644, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.050478890553221, + "kl": 0.916015625, + "learning_rate": 9.110066202128321e-07, + "loss": 0.0367, + "num_tokens": 2656683874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8771869932576598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6410574947309295, + "kl": 0.96875, + "learning_rate": 9.085237027116634e-07, + "loss": 0.0387, + "num_tokens": 2657252290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8773576854143552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5870606060764266, + "kl": 0.92578125, + "learning_rate": 9.060440123563941e-07, + "loss": 0.037, + "num_tokens": 2657821746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8775283775710506, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3668660811886721, + "kl": 1.21484375, + "learning_rate": 9.0356755002723e-07, + "loss": 0.0486, + "num_tokens": 2658384002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.877699069727746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6736100412288358, + "kl": 1.1337890625, + "learning_rate": 9.010943166032216e-07, + "loss": 0.0453, + "num_tokens": 2658945234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8778697618844414, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1416318184126422, + "kl": 1.095703125, + "learning_rate": 8.986243129622807e-07, + "loss": 0.0438, + "num_tokens": 2659512258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8780404540411368, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1015253752490481, + "kl": 0.92578125, + "learning_rate": 8.961575399811661e-07, + "loss": 0.037, + "num_tokens": 2660084834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8782111461978322, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1250672105372503, + "kl": 0.9150390625, + "learning_rate": 8.936939985354964e-07, + "loss": 0.0366, + "num_tokens": 2660651266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8783818383545277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7728637071972374, + "kl": 0.8046875, + "learning_rate": 8.912336894997365e-07, + "loss": 0.0322, + "num_tokens": 2661216098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8785525305112231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.454998847879922, + "kl": 0.6904296875, + "learning_rate": 8.887766137472121e-07, + "loss": 0.0276, + "num_tokens": 2661779330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8787232226679184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7025339734699063, + "kl": 0.6171875, + "learning_rate": 8.863227721500911e-07, + "loss": 0.0247, + "num_tokens": 2662355074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8788939148246138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5347354844910257, + "kl": 0.4765625, + "learning_rate": 8.838721655794025e-07, + "loss": 0.0191, + "num_tokens": 2662922242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8790646069813092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7519300747658557, + "kl": 0.42626953125, + "learning_rate": 8.814247949050203e-07, + "loss": 0.017, + "num_tokens": 2663486498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8792352991380046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2928783503521023, + "kl": 0.3447265625, + "learning_rate": 8.789806609956763e-07, + "loss": 0.0138, + "num_tokens": 2664046146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8794059912947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09061597758085566, + "kl": 0.3603515625, + "learning_rate": 8.765397647189467e-07, + "loss": 0.0144, + "num_tokens": 2664614402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8795766834513954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.48148728284986947, + "kl": 0.3486328125, + "learning_rate": 8.741021069412636e-07, + "loss": 0.0139, + "num_tokens": 2665176194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8797473756080908, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23933864839635055, + "kl": 0.243408203125, + "learning_rate": 8.71667688527904e-07, + "loss": 0.0097, + "num_tokens": 2665744626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8799180677647862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20728196428345813, + "kl": 0.254150390625, + "learning_rate": 8.692365103430022e-07, + "loss": 0.0102, + "num_tokens": 2666328322.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8800887599214816, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05270446704700655, + "kl": 0.21337890625, + "learning_rate": 8.668085732495391e-07, + "loss": 0.0085, + "num_tokens": 2666896930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.880259452078177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03398543371699332, + "kl": 0.14111328125, + "learning_rate": 8.643838781093427e-07, + "loss": 0.0056, + "num_tokens": 2667464242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8804301442348724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07758521398972168, + "kl": 0.169921875, + "learning_rate": 8.619624257830916e-07, + "loss": 0.0068, + "num_tokens": 2668028082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8806008363915678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08412213297554871, + "kl": 0.154296875, + "learning_rate": 8.595442171303147e-07, + "loss": 0.0062, + "num_tokens": 2668591938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8807715285482632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06509843277341139, + "kl": 0.150146484375, + "learning_rate": 8.571292530093911e-07, + "loss": 0.006, + "num_tokens": 2669153058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8809422207049586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0677580656555693, + "kl": 0.176025390625, + "learning_rate": 8.547175342775449e-07, + "loss": 0.007, + "num_tokens": 2669720946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.881112912861654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619430581117393, + "kl": 0.149169921875, + "learning_rate": 8.523090617908458e-07, + "loss": 0.006, + "num_tokens": 2670295682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8812836050183495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03308283769513717, + "kl": 0.1497802734375, + "learning_rate": 8.499038364042167e-07, + "loss": 0.006, + "num_tokens": 2670864530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8814542971750448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045640833470583494, + "kl": 0.1151123046875, + "learning_rate": 8.475018589714279e-07, + "loss": 0.0046, + "num_tokens": 2671432642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8816249893317402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05959811088847402, + "kl": 0.11669921875, + "learning_rate": 8.45103130345094e-07, + "loss": 0.0047, + "num_tokens": 2672001442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8817956814884356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04077822687814194, + "kl": 0.140380859375, + "learning_rate": 8.42707651376673e-07, + "loss": 0.0056, + "num_tokens": 2672573858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.881966373645131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05613381669728037, + "kl": 0.1375732421875, + "learning_rate": 8.403154229164779e-07, + "loss": 0.0055, + "num_tokens": 2673135810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8821370658018264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030826396997943352, + "kl": 0.14404296875, + "learning_rate": 8.379264458136627e-07, + "loss": 0.0058, + "num_tokens": 2673704546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8823077579585218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029828057192338303, + "kl": 0.11865234375, + "learning_rate": 8.35540720916228e-07, + "loss": 0.0047, + "num_tokens": 2674264706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8824784501152172, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06893887521473593, + "kl": 0.130859375, + "learning_rate": 8.331582490710166e-07, + "loss": 0.0052, + "num_tokens": 2674830002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8826491422719126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052601165071688875, + "kl": 0.1650390625, + "learning_rate": 8.30779031123724e-07, + "loss": 0.0066, + "num_tokens": 2675392354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.882819834428608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.172473035207115, + "kl": 0.173095703125, + "learning_rate": 8.284030679188871e-07, + "loss": 0.0069, + "num_tokens": 2675958018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8829905265853034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10887248035203502, + "kl": 0.129638671875, + "learning_rate": 8.260303602998842e-07, + "loss": 0.0052, + "num_tokens": 2676525282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8831612187419988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06180652489989087, + "kl": 0.17724609375, + "learning_rate": 8.23660909108942e-07, + "loss": 0.0071, + "num_tokens": 2677088786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8833319108986942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1561450207082621, + "kl": 0.214599609375, + "learning_rate": 8.212947151871287e-07, + "loss": 0.0086, + "num_tokens": 2677653714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8835026030553896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10472805857130589, + "kl": 0.22265625, + "learning_rate": 8.189317793743623e-07, + "loss": 0.0089, + "num_tokens": 2678217106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.883673295212085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10558372429817903, + "kl": 0.181640625, + "learning_rate": 8.165721025093965e-07, + "loss": 0.0073, + "num_tokens": 2678784850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8838439873687804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05840828719101342, + "kl": 0.2060546875, + "learning_rate": 8.142156854298289e-07, + "loss": 0.0082, + "num_tokens": 2679354482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8840146795254759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21384039244197944, + "kl": 0.189453125, + "learning_rate": 8.118625289721061e-07, + "loss": 0.0076, + "num_tokens": 2679918354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8841853716821712, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03247549885683408, + "kl": 0.1884765625, + "learning_rate": 8.095126339715142e-07, + "loss": 0.0075, + "num_tokens": 2680493218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8843560638388666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11413137987105249, + "kl": 0.23876953125, + "learning_rate": 8.071660012621785e-07, + "loss": 0.0095, + "num_tokens": 2681054466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.884526755995562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20722873105251804, + "kl": 0.26123046875, + "learning_rate": 8.048226316770713e-07, + "loss": 0.0104, + "num_tokens": 2681616898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8846974481522574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07781129390494408, + "kl": 0.2529296875, + "learning_rate": 8.024825260480019e-07, + "loss": 0.0101, + "num_tokens": 2682182610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8848681403089528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0985010032437895, + "kl": 0.27685546875, + "learning_rate": 8.001456852056255e-07, + "loss": 0.0111, + "num_tokens": 2682740386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8850388324656482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13070058006110574, + "kl": 0.2568359375, + "learning_rate": 7.978121099794334e-07, + "loss": 0.0103, + "num_tokens": 2683306146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8852095246223436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14727145435839936, + "kl": 0.2724609375, + "learning_rate": 7.954818011977639e-07, + "loss": 0.0109, + "num_tokens": 2683870418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.885380216779039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14589345544965895, + "kl": 0.29833984375, + "learning_rate": 7.931547596877898e-07, + "loss": 0.0119, + "num_tokens": 2684432978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8855509089357344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1378120056174524, + "kl": 0.3095703125, + "learning_rate": 7.908309862755292e-07, + "loss": 0.0124, + "num_tokens": 2684998194.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8857216010924298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15616013834666112, + "kl": 0.280517578125, + "learning_rate": 7.885104817858358e-07, + "loss": 0.0112, + "num_tokens": 2685563154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8858922932491252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14560699713215608, + "kl": 0.3359375, + "learning_rate": 7.861932470424061e-07, + "loss": 0.0134, + "num_tokens": 2686121554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8860629854058206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11324708989123769, + "kl": 0.32861328125, + "learning_rate": 7.838792828677732e-07, + "loss": 0.0132, + "num_tokens": 2686686498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.886233677562516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16208196188224286, + "kl": 0.3603515625, + "learning_rate": 7.815685900833147e-07, + "loss": 0.0144, + "num_tokens": 2687248370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8864043697192114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10552785832546008, + "kl": 0.298828125, + "learning_rate": 7.792611695092378e-07, + "loss": 0.012, + "num_tokens": 2687817202.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8865750618759068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25774674107141715, + "kl": 0.3525390625, + "learning_rate": 7.769570219645984e-07, + "loss": 0.0141, + "num_tokens": 2688379250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8867457540326023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10775681133944448, + "kl": 0.35693359375, + "learning_rate": 7.74656148267281e-07, + "loss": 0.0143, + "num_tokens": 2688941842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8869164461892975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11472459042817505, + "kl": 0.314697265625, + "learning_rate": 7.723585492340169e-07, + "loss": 0.0126, + "num_tokens": 2689508178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.887087138345993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04023783457886491, + "kl": 0.289306640625, + "learning_rate": 7.700642256803681e-07, + "loss": 0.0116, + "num_tokens": 2690071554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8872578305026884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06709864264008329, + "kl": 0.34521484375, + "learning_rate": 7.677731784207377e-07, + "loss": 0.0138, + "num_tokens": 2690640354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8874285226593838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06433823910793934, + "kl": 0.303955078125, + "learning_rate": 7.654854082683649e-07, + "loss": 0.0121, + "num_tokens": 2691204882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8875992148160792, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5681818716269535, + "kl": 0.35888671875, + "learning_rate": 7.632009160353271e-07, + "loss": 0.0143, + "num_tokens": 2691768018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8877699069727746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30592647786778443, + "kl": 0.3271484375, + "learning_rate": 7.609197025325321e-07, + "loss": 0.0131, + "num_tokens": 2692332930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.88794059912947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050884020531839, + "kl": 0.3232421875, + "learning_rate": 7.586417685697311e-07, + "loss": 0.0129, + "num_tokens": 2692895666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8881112912861654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17268758771723836, + "kl": 0.283447265625, + "learning_rate": 7.563671149555108e-07, + "loss": 0.0113, + "num_tokens": 2693462114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8882819834428608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11894833930449232, + "kl": 0.37451171875, + "learning_rate": 7.540957424972883e-07, + "loss": 0.015, + "num_tokens": 2694024914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8884526755995562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12103372846405315, + "kl": 0.37353515625, + "learning_rate": 7.518276520013179e-07, + "loss": 0.0149, + "num_tokens": 2694590274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8886233677562516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17304965906156433, + "kl": 0.3271484375, + "learning_rate": 7.495628442726899e-07, + "loss": 0.0131, + "num_tokens": 2695153746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.888794059912947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19730651829080031, + "kl": 0.40185546875, + "learning_rate": 7.473013201153334e-07, + "loss": 0.0161, + "num_tokens": 2695717602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8889647520696424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16554035102465262, + "kl": 0.35205078125, + "learning_rate": 7.450430803320052e-07, + "loss": 0.0141, + "num_tokens": 2696280082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8891354442263378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053987683357164806, + "kl": 0.324951171875, + "learning_rate": 7.427881257242964e-07, + "loss": 0.013, + "num_tokens": 2696846850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8893061363830332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11043223319626484, + "kl": 0.30859375, + "learning_rate": 7.405364570926376e-07, + "loss": 0.0123, + "num_tokens": 2697409234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8894768285397286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0415996381167264, + "kl": 0.3310546875, + "learning_rate": 7.382880752362897e-07, + "loss": 0.0132, + "num_tokens": 2697970498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.889647520696424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17504381637672117, + "kl": 0.36279296875, + "learning_rate": 7.360429809533465e-07, + "loss": 0.0145, + "num_tokens": 2698535650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8898182128531194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29815607934506666, + "kl": 0.3505859375, + "learning_rate": 7.338011750407348e-07, + "loss": 0.014, + "num_tokens": 2699097922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8899889050098148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09250776107064153, + "kl": 0.276123046875, + "learning_rate": 7.315626582942148e-07, + "loss": 0.011, + "num_tokens": 2699663890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8901595971665102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22807903617750433, + "kl": 0.3564453125, + "learning_rate": 7.293274315083798e-07, + "loss": 0.0143, + "num_tokens": 2700232642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8903302893232056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2627357871875343, + "kl": 0.35791015625, + "learning_rate": 7.270954954766574e-07, + "loss": 0.0143, + "num_tokens": 2700795362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.890500981479901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08380412624192712, + "kl": 0.35595703125, + "learning_rate": 7.248668509912992e-07, + "loss": 0.0143, + "num_tokens": 2701357138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8906716736365964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14209513456730175, + "kl": 0.27587890625, + "learning_rate": 7.22641498843395e-07, + "loss": 0.011, + "num_tokens": 2701923154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8908423657932918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19945425502619804, + "kl": 0.3701171875, + "learning_rate": 7.204194398228658e-07, + "loss": 0.0148, + "num_tokens": 2702485282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8910130579499872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08696318208959956, + "kl": 0.26220703125, + "learning_rate": 7.182006747184634e-07, + "loss": 0.0105, + "num_tokens": 2703051074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8911837501066826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13226285747954883, + "kl": 0.292236328125, + "learning_rate": 7.159852043177673e-07, + "loss": 0.0117, + "num_tokens": 2703623794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.891354442263378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3315310312160121, + "kl": 0.252197265625, + "learning_rate": 7.137730294071887e-07, + "loss": 0.0101, + "num_tokens": 2704187970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8915251344200734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15865599241578465, + "kl": 0.314453125, + "learning_rate": 7.115641507719707e-07, + "loss": 0.0126, + "num_tokens": 2704749474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8916958265767688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09148987236021698, + "kl": 0.2275390625, + "learning_rate": 7.09358569196188e-07, + "loss": 0.0091, + "num_tokens": 2705314706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8918665187334642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19305112949225842, + "kl": 0.28466796875, + "learning_rate": 7.0715628546274e-07, + "loss": 0.0114, + "num_tokens": 2705882162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8920372108901596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.954918695987276, + "kl": 0.39013671875, + "learning_rate": 7.049573003533572e-07, + "loss": 0.0156, + "num_tokens": 2706452466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.892207903046855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13742189350214964, + "kl": 0.27880859375, + "learning_rate": 7.027616146486005e-07, + "loss": 0.0111, + "num_tokens": 2707020098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8923785952035505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11412446120461969, + "kl": 0.262939453125, + "learning_rate": 7.00569229127861e-07, + "loss": 0.0105, + "num_tokens": 2707582354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8925492873602457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27218452725252695, + "kl": 0.31982421875, + "learning_rate": 6.983801445693561e-07, + "loss": 0.0128, + "num_tokens": 2708147858.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8927199795169412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23618389378640972, + "kl": 0.31201171875, + "learning_rate": 6.961943617501277e-07, + "loss": 0.0125, + "num_tokens": 2708712802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8928906716736366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13196995570434839, + "kl": 0.294677734375, + "learning_rate": 6.94011881446054e-07, + "loss": 0.0118, + "num_tokens": 2709283810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.893061363830332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17440551165884383, + "kl": 0.3271484375, + "learning_rate": 6.918327044318363e-07, + "loss": 0.0131, + "num_tokens": 2709850114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8932320559870274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1022028931592686, + "kl": 0.32861328125, + "learning_rate": 6.896568314810026e-07, + "loss": 0.0131, + "num_tokens": 2710417010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8934027481437228, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20157080095158064, + "kl": 0.308349609375, + "learning_rate": 6.874842633659085e-07, + "loss": 0.0123, + "num_tokens": 2710979154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8935734403004182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05991519664583207, + "kl": 0.27685546875, + "learning_rate": 6.853150008577391e-07, + "loss": 0.0111, + "num_tokens": 2711544178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8937441324571136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2098473121521763, + "kl": 0.3466796875, + "learning_rate": 6.831490447265043e-07, + "loss": 0.0139, + "num_tokens": 2712113154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.893914824613809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15057479914979735, + "kl": 0.266357421875, + "learning_rate": 6.809863957410401e-07, + "loss": 0.0107, + "num_tokens": 2712673746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8940855167705044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17666861848438942, + "kl": 0.34521484375, + "learning_rate": 6.788270546690068e-07, + "loss": 0.0138, + "num_tokens": 2713248978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8942562089271998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19246410916660975, + "kl": 0.349609375, + "learning_rate": 6.766710222768935e-07, + "loss": 0.014, + "num_tokens": 2713814242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8944269010838952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3107961558655456, + "kl": 0.3466796875, + "learning_rate": 6.745182993300158e-07, + "loss": 0.0139, + "num_tokens": 2714387602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8945975932405906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04853556373451275, + "kl": 0.29345703125, + "learning_rate": 6.723688865925105e-07, + "loss": 0.0117, + "num_tokens": 2714954002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.894768285397286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07744490166823505, + "kl": 0.33349609375, + "learning_rate": 6.702227848273435e-07, + "loss": 0.0133, + "num_tokens": 2715521522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8949389775539814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2824673375632989, + "kl": 0.3701171875, + "learning_rate": 6.680799947963012e-07, + "loss": 0.0148, + "num_tokens": 2716092706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8951096697106768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10240958500033126, + "kl": 0.40087890625, + "learning_rate": 6.659405172599986e-07, + "loss": 0.016, + "num_tokens": 2716659810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8952803618673721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.46989464366013717, + "kl": 0.44775390625, + "learning_rate": 6.638043529778715e-07, + "loss": 0.0179, + "num_tokens": 2717223634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8954510540240675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1263220716502808, + "kl": 0.41064453125, + "learning_rate": 6.616715027081833e-07, + "loss": 0.0164, + "num_tokens": 2717790146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.895621746180763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15109369656373126, + "kl": 0.40185546875, + "learning_rate": 6.59541967208015e-07, + "loss": 0.0161, + "num_tokens": 2718352274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8957924383374584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09752004565966702, + "kl": 0.37353515625, + "learning_rate": 6.574157472332787e-07, + "loss": 0.0149, + "num_tokens": 2718918498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8959631304941538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27357132929421657, + "kl": 0.43896484375, + "learning_rate": 6.55292843538703e-07, + "loss": 0.0176, + "num_tokens": 2719485138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8961338226508492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19341284799760036, + "kl": 0.35986328125, + "learning_rate": 6.531732568778449e-07, + "loss": 0.0144, + "num_tokens": 2720049666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8963045148075446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33779003057142476, + "kl": 0.36572265625, + "learning_rate": 6.510569880030782e-07, + "loss": 0.0146, + "num_tokens": 2720614818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.89647520696424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4431795224217573, + "kl": 0.45654296875, + "learning_rate": 6.489440376656042e-07, + "loss": 0.0183, + "num_tokens": 2721187666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8966458991209354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050898702711197966, + "kl": 0.36767578125, + "learning_rate": 6.468344066154419e-07, + "loss": 0.0147, + "num_tokens": 2721752242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8968165912776308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16984879772138353, + "kl": 0.384765625, + "learning_rate": 6.447280956014368e-07, + "loss": 0.0154, + "num_tokens": 2722316738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8969872834343262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.45289614014688745, + "kl": 0.39013671875, + "learning_rate": 6.426251053712506e-07, + "loss": 0.0156, + "num_tokens": 2722883586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8971579755910216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06716743820245266, + "kl": 0.4111328125, + "learning_rate": 6.40525436671372e-07, + "loss": 0.0164, + "num_tokens": 2723444514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.897328667747717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0692689275177475, + "kl": 0.3681640625, + "learning_rate": 6.384290902471035e-07, + "loss": 0.0147, + "num_tokens": 2724008562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8974993599044124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18082410036936797, + "kl": 0.40966796875, + "learning_rate": 6.363360668425744e-07, + "loss": 0.0164, + "num_tokens": 2724569794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8976700520611078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3131202122383894, + "kl": 0.42431640625, + "learning_rate": 6.342463672007348e-07, + "loss": 0.017, + "num_tokens": 2725140258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8978407442178032, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13739362254121018, + "kl": 0.46484375, + "learning_rate": 6.321599920633503e-07, + "loss": 0.0186, + "num_tokens": 2725707762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8980114363744985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10523225663081284, + "kl": 0.36181640625, + "learning_rate": 6.300769421710084e-07, + "loss": 0.0145, + "num_tokens": 2726272994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.898182128531194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09842954043854185, + "kl": 0.37646484375, + "learning_rate": 6.279972182631166e-07, + "loss": 0.0151, + "num_tokens": 2726837362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8983528206878894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2660302058958767, + "kl": 0.40380859375, + "learning_rate": 6.259208210779033e-07, + "loss": 0.0162, + "num_tokens": 2727406978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8985235128445848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09006211710751712, + "kl": 0.37255859375, + "learning_rate": 6.238477513524144e-07, + "loss": 0.0149, + "num_tokens": 2727970834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8986942050012802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1822461619076471, + "kl": 0.396484375, + "learning_rate": 6.217780098225124e-07, + "loss": 0.0158, + "num_tokens": 2728539490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8988648971579756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11651360218186897, + "kl": 0.44384765625, + "learning_rate": 6.197115972228829e-07, + "loss": 0.0178, + "num_tokens": 2729103362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.899035589314671, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05040469535682722, + "kl": 0.4423828125, + "learning_rate": 6.176485142870281e-07, + "loss": 0.0177, + "num_tokens": 2729665490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8992062814713664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1627831411058642, + "kl": 0.4267578125, + "learning_rate": 6.155887617472678e-07, + "loss": 0.0171, + "num_tokens": 2730232770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8993769736280618, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10423861506048948, + "kl": 0.45361328125, + "learning_rate": 6.135323403347381e-07, + "loss": 0.0182, + "num_tokens": 2730798802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8995476657847572, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09958341935230222, + "kl": 0.3623046875, + "learning_rate": 6.114792507793954e-07, + "loss": 0.0145, + "num_tokens": 2731361090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8997183579414526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17704570307677567, + "kl": 0.3935546875, + "learning_rate": 6.094294938100143e-07, + "loss": 0.0157, + "num_tokens": 2731933634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.899889050098148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17572869877766895, + "kl": 0.41015625, + "learning_rate": 6.073830701541816e-07, + "loss": 0.0164, + "num_tokens": 2732498018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9000597422548434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1070133753499159, + "kl": 0.3837890625, + "learning_rate": 6.053399805383042e-07, + "loss": 0.0153, + "num_tokens": 2733064930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9002304344115388, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2390121924938573, + "kl": 0.6259765625, + "learning_rate": 6.033002256876064e-07, + "loss": 0.025, + "num_tokens": 2733633794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9004011265682342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12048698341107765, + "kl": 0.4248046875, + "learning_rate": 6.012638063261278e-07, + "loss": 0.017, + "num_tokens": 2734201682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9005718187249296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17587350028718537, + "kl": 0.4736328125, + "learning_rate": 5.992307231767236e-07, + "loss": 0.0189, + "num_tokens": 2734766738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9007425108816249, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1265523759183971, + "kl": 0.47607421875, + "learning_rate": 5.972009769610632e-07, + "loss": 0.019, + "num_tokens": 2735338802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9009132030383203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.145727480199831, + "kl": 0.47216796875, + "learning_rate": 5.951745683996335e-07, + "loss": 0.0189, + "num_tokens": 2735902466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9010838951950157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38082018410844376, + "kl": 0.494140625, + "learning_rate": 5.931514982117381e-07, + "loss": 0.0198, + "num_tokens": 2736469378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9012545873517112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42679139392315574, + "kl": 0.57421875, + "learning_rate": 5.911317671154959e-07, + "loss": 0.023, + "num_tokens": 2737045298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9014252795084066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22016652696905817, + "kl": 0.54736328125, + "learning_rate": 5.891153758278323e-07, + "loss": 0.0219, + "num_tokens": 2737612418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.901595971665102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3538677251558711, + "kl": 0.5244140625, + "learning_rate": 5.871023250644981e-07, + "loss": 0.021, + "num_tokens": 2738174738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9017666638217974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11675688030084065, + "kl": 0.47314453125, + "learning_rate": 5.850926155400504e-07, + "loss": 0.0189, + "num_tokens": 2738742850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9019373559784928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22257447427061375, + "kl": 0.47705078125, + "learning_rate": 5.830862479678689e-07, + "loss": 0.0191, + "num_tokens": 2739307714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9021080481351882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19801840584386682, + "kl": 0.5185546875, + "learning_rate": 5.810832230601371e-07, + "loss": 0.0207, + "num_tokens": 2739883522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9022787402918836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28148387103102906, + "kl": 0.48193359375, + "learning_rate": 5.790835415278573e-07, + "loss": 0.0193, + "num_tokens": 2740449362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.902449432448579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07934025416627283, + "kl": 0.419921875, + "learning_rate": 5.770872040808439e-07, + "loss": 0.0168, + "num_tokens": 2741016946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9026201246052744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1920282357528701, + "kl": 0.53466796875, + "learning_rate": 5.750942114277269e-07, + "loss": 0.0214, + "num_tokens": 2741584546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9027908167619698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16624030625779884, + "kl": 0.50927734375, + "learning_rate": 5.731045642759459e-07, + "loss": 0.0204, + "num_tokens": 2742149106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9029615089186652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13150226326588924, + "kl": 0.5009765625, + "learning_rate": 5.711182633317514e-07, + "loss": 0.02, + "num_tokens": 2742712882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9031322010753606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27188557897718585, + "kl": 0.572265625, + "learning_rate": 5.691353093002106e-07, + "loss": 0.0229, + "num_tokens": 2743281378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.903302893232056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1676104445254675, + "kl": 0.42333984375, + "learning_rate": 5.671557028852015e-07, + "loss": 0.0169, + "num_tokens": 2743853890.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9034735853887513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23652826106710031, + "kl": 0.50439453125, + "learning_rate": 5.65179444789411e-07, + "loss": 0.0202, + "num_tokens": 2744417362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9036442775454467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1869673123742705, + "kl": 0.4638671875, + "learning_rate": 5.632065357143401e-07, + "loss": 0.0185, + "num_tokens": 2744984610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9038149697021421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06082798002745592, + "kl": 0.46826171875, + "learning_rate": 5.612369763602987e-07, + "loss": 0.0187, + "num_tokens": 2745555954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9039856618588376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2029212177383842, + "kl": 0.50390625, + "learning_rate": 5.592707674264142e-07, + "loss": 0.0202, + "num_tokens": 2746121330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.904156354015533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13707334313203556, + "kl": 0.4423828125, + "learning_rate": 5.573079096106149e-07, + "loss": 0.0177, + "num_tokens": 2746690370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9043270461722284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05691139209487419, + "kl": 0.4189453125, + "learning_rate": 5.553484036096458e-07, + "loss": 0.0168, + "num_tokens": 2747257682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9044977383289238, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3725873760789345, + "kl": 0.46826171875, + "learning_rate": 5.533922501190602e-07, + "loss": 0.0187, + "num_tokens": 2747820930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9046684304856192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19423524154834573, + "kl": 0.40625, + "learning_rate": 5.514394498332254e-07, + "loss": 0.0163, + "num_tokens": 2748384898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9048391226423146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19217796564100295, + "kl": 0.41357421875, + "learning_rate": 5.494900034453099e-07, + "loss": 0.0165, + "num_tokens": 2748945250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.90500981479901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06731664368213658, + "kl": 0.45458984375, + "learning_rate": 5.475439116473014e-07, + "loss": 0.0182, + "num_tokens": 2749510498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9051805069557054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10522574332578657, + "kl": 0.388671875, + "learning_rate": 5.456011751299884e-07, + "loss": 0.0155, + "num_tokens": 2750073826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9053511991124008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30430270413563154, + "kl": 0.41015625, + "learning_rate": 5.436617945829758e-07, + "loss": 0.0164, + "num_tokens": 2750642274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9055218912690962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.447289672141984, + "kl": 0.376953125, + "learning_rate": 5.4172577069467e-07, + "loss": 0.0151, + "num_tokens": 2751206802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9056925834257916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29038199918560975, + "kl": 0.392578125, + "learning_rate": 5.397931041522942e-07, + "loss": 0.0157, + "num_tokens": 2751773186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.905863275582487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24791292155862227, + "kl": 0.4384765625, + "learning_rate": 5.378637956418697e-07, + "loss": 0.0175, + "num_tokens": 2752335266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9060339677391824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11018756698928649, + "kl": 0.3671875, + "learning_rate": 5.359378458482367e-07, + "loss": 0.0147, + "num_tokens": 2752894530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9062046598958778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8902024062673894, + "kl": 0.498046875, + "learning_rate": 5.340152554550326e-07, + "loss": 0.0199, + "num_tokens": 2753467186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9063753520525731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25702368044160173, + "kl": 0.40625, + "learning_rate": 5.320960251447127e-07, + "loss": 0.0163, + "num_tokens": 2754030818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9065460442092685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17808664510081917, + "kl": 0.44384765625, + "learning_rate": 5.301801555985309e-07, + "loss": 0.0178, + "num_tokens": 2754591522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.906716736365964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3875696161485425, + "kl": 0.47607421875, + "learning_rate": 5.282676474965543e-07, + "loss": 0.019, + "num_tokens": 2755152882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9068874285226594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3658357578629344, + "kl": 0.44580078125, + "learning_rate": 5.26358501517652e-07, + "loss": 0.0178, + "num_tokens": 2755714050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9070581206793548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17776465241210965, + "kl": 0.45947265625, + "learning_rate": 5.244527183395032e-07, + "loss": 0.0184, + "num_tokens": 2756288658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9072288128360502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38032083388287213, + "kl": 0.4873046875, + "learning_rate": 5.225502986385911e-07, + "loss": 0.0195, + "num_tokens": 2756855346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9073995049927456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12500925591058332, + "kl": 0.4833984375, + "learning_rate": 5.206512430902077e-07, + "loss": 0.0193, + "num_tokens": 2757419346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.907570197149441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3207790489470742, + "kl": 0.4619140625, + "learning_rate": 5.187555523684473e-07, + "loss": 0.0185, + "num_tokens": 2757986018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9077408893061364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1577265168912342, + "kl": 0.43408203125, + "learning_rate": 5.168632271462126e-07, + "loss": 0.0174, + "num_tokens": 2758550034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9079115814628318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20448524702667822, + "kl": 0.5263671875, + "learning_rate": 5.149742680952108e-07, + "loss": 0.021, + "num_tokens": 2759114674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9080822736195272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12764165959782625, + "kl": 0.505859375, + "learning_rate": 5.130886758859555e-07, + "loss": 0.0203, + "num_tokens": 2759675842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9082529657762226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0887600652852666, + "kl": 0.5341796875, + "learning_rate": 5.112064511877602e-07, + "loss": 0.0214, + "num_tokens": 2760243522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.908423657932918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18508132317076914, + "kl": 0.6181640625, + "learning_rate": 5.093275946687492e-07, + "loss": 0.0248, + "num_tokens": 2760802642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9085943500896134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14412940132771684, + "kl": 0.501953125, + "learning_rate": 5.074521069958504e-07, + "loss": 0.0201, + "num_tokens": 2761374338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9087650422463088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3320445186064784, + "kl": 0.4912109375, + "learning_rate": 5.055799888347912e-07, + "loss": 0.0197, + "num_tokens": 2761937106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9089357344030042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.727171799944098, + "kl": 0.51806640625, + "learning_rate": 5.037112408501066e-07, + "loss": 0.0207, + "num_tokens": 2762506114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9091064265596995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22289062402593363, + "kl": 0.48486328125, + "learning_rate": 5.018458637051348e-07, + "loss": 0.0194, + "num_tokens": 2763070514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9092771187163949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33711916214491733, + "kl": 0.6025390625, + "learning_rate": 4.999838580620197e-07, + "loss": 0.0241, + "num_tokens": 2763634386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9094478108730903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2930773165376038, + "kl": 0.5751953125, + "learning_rate": 4.981252245817036e-07, + "loss": 0.023, + "num_tokens": 2764196786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9096185030297858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21730747715973303, + "kl": 0.48779296875, + "learning_rate": 4.962699639239333e-07, + "loss": 0.0195, + "num_tokens": 2764764226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9097891951864812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.376390956342516, + "kl": 0.5380859375, + "learning_rate": 4.944180767472606e-07, + "loss": 0.0215, + "num_tokens": 2765334834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9099598873431766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17693666881575498, + "kl": 0.537109375, + "learning_rate": 4.925695637090411e-07, + "loss": 0.0215, + "num_tokens": 2765898034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.910130579499872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0661144353317183, + "kl": 0.51220703125, + "learning_rate": 4.907244254654264e-07, + "loss": 0.0205, + "num_tokens": 2766463554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9103012716565674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.184412696856014, + "kl": 0.49169921875, + "learning_rate": 4.888826626713761e-07, + "loss": 0.0197, + "num_tokens": 2767029570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9104719638132628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.170903312004243, + "kl": 0.509765625, + "learning_rate": 4.870442759806482e-07, + "loss": 0.0203, + "num_tokens": 2767590242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9106426559699582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3259033602566727, + "kl": 0.51025390625, + "learning_rate": 4.852092660458052e-07, + "loss": 0.0204, + "num_tokens": 2768158498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9108133481266536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0850526842999114, + "kl": 0.46142578125, + "learning_rate": 4.833776335182084e-07, + "loss": 0.0185, + "num_tokens": 2768727138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.910984040283349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544314606340854, + "kl": 0.42578125, + "learning_rate": 4.815493790480208e-07, + "loss": 0.017, + "num_tokens": 2769302690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9111547324400444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11240802380459189, + "kl": 0.4326171875, + "learning_rate": 4.797245032842069e-07, + "loss": 0.0173, + "num_tokens": 2769871090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9113254245967398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24272571507867574, + "kl": 0.4716796875, + "learning_rate": 4.779030068745327e-07, + "loss": 0.0189, + "num_tokens": 2770431410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9114961167534352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41200704349544925, + "kl": 0.5419921875, + "learning_rate": 4.7608489046556463e-07, + "loss": 0.0216, + "num_tokens": 2771012930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9116668089101306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19381597225084876, + "kl": 0.53076171875, + "learning_rate": 4.7427015470266513e-07, + "loss": 0.0212, + "num_tokens": 2771574690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9118375010668259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7223882726627774, + "kl": 0.50146484375, + "learning_rate": 4.7245880023000123e-07, + "loss": 0.0201, + "num_tokens": 2772142418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9120081932235213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1188963735272312, + "kl": 0.47900390625, + "learning_rate": 4.706508276905386e-07, + "loss": 0.0192, + "num_tokens": 2772705730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9121788853802167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09870422728001396, + "kl": 0.45458984375, + "learning_rate": 4.6884623772604385e-07, + "loss": 0.0182, + "num_tokens": 2773270722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9123495775369121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10374806339775838, + "kl": 0.5048828125, + "learning_rate": 4.670450309770802e-07, + "loss": 0.0202, + "num_tokens": 2773830770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9125202696936076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22171390508777672, + "kl": 0.49853515625, + "learning_rate": 4.652472080830095e-07, + "loss": 0.0199, + "num_tokens": 2774399490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.912690961850303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1363345104985387, + "kl": 0.46826171875, + "learning_rate": 4.634527696819946e-07, + "loss": 0.0187, + "num_tokens": 2774969522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9128616540069984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.276565656747114, + "kl": 0.49462890625, + "learning_rate": 4.616617164109982e-07, + "loss": 0.0198, + "num_tokens": 2775539954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9130323461636938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18012583870365753, + "kl": 0.4755859375, + "learning_rate": 4.5987404890577846e-07, + "loss": 0.019, + "num_tokens": 2776108866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9132030383203892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6433781622741324, + "kl": 0.5146484375, + "learning_rate": 4.580897678008911e-07, + "loss": 0.0206, + "num_tokens": 2776676018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9133737304770846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3319361177353522, + "kl": 0.4990234375, + "learning_rate": 4.563088737296928e-07, + "loss": 0.02, + "num_tokens": 2777239250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.91354442263378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13925746380583004, + "kl": 0.4951171875, + "learning_rate": 4.545313673243379e-07, + "loss": 0.0198, + "num_tokens": 2777808274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9137151147904754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24011108823048796, + "kl": 0.53076171875, + "learning_rate": 4.5275724921577615e-07, + "loss": 0.0213, + "num_tokens": 2778390674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9138858069471708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3543047328173965, + "kl": 0.5869140625, + "learning_rate": 4.5098652003375486e-07, + "loss": 0.0235, + "num_tokens": 2778953426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9140564991038662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09089718483868524, + "kl": 0.5390625, + "learning_rate": 4.49219180406818e-07, + "loss": 0.0216, + "num_tokens": 2779523618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9142271912605616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07922031276967888, + "kl": 0.5634765625, + "learning_rate": 4.4745523096231145e-07, + "loss": 0.0226, + "num_tokens": 2780089346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.914397883417257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.124886708909327, + "kl": 0.5869140625, + "learning_rate": 4.4569467232636997e-07, + "loss": 0.0235, + "num_tokens": 2780658162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9145685755739523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2671690207982474, + "kl": 0.5830078125, + "learning_rate": 4.4393750512392806e-07, + "loss": 0.0233, + "num_tokens": 2781222786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9147392677306477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08418578225216536, + "kl": 0.5634765625, + "learning_rate": 4.4218372997871796e-07, + "loss": 0.0225, + "num_tokens": 2781795330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9149099598873431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2016167295324872, + "kl": 0.591796875, + "learning_rate": 4.404333475132672e-07, + "loss": 0.0237, + "num_tokens": 2782359282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9150806520440385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32954463527116934, + "kl": 0.5556640625, + "learning_rate": 4.386863583488965e-07, + "loss": 0.0223, + "num_tokens": 2782932498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.915251344200734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38660734624940396, + "kl": 0.52490234375, + "learning_rate": 4.369427631057266e-07, + "loss": 0.021, + "num_tokens": 2783502850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9154220363574294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1352289918164546, + "kl": 0.576171875, + "learning_rate": 4.352025624026679e-07, + "loss": 0.023, + "num_tokens": 2784071458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9155927285141248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0914032029230868, + "kl": 0.5634765625, + "learning_rate": 4.3346575685743074e-07, + "loss": 0.0225, + "num_tokens": 2784637250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9157634206708202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28960643367946065, + "kl": 0.57421875, + "learning_rate": 4.317323470865176e-07, + "loss": 0.023, + "num_tokens": 2785197506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9159341128275156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14139878443783846, + "kl": 0.6103515625, + "learning_rate": 4.3000233370522725e-07, + "loss": 0.0244, + "num_tokens": 2785763010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.916104804984211, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37818451254658286, + "kl": 0.63671875, + "learning_rate": 4.282757173276497e-07, + "loss": 0.0255, + "num_tokens": 2786330130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9162754971409064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10313268806337694, + "kl": 0.6396484375, + "learning_rate": 4.265524985666758e-07, + "loss": 0.0256, + "num_tokens": 2786891570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9164461892976018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3881980621733706, + "kl": 0.6181640625, + "learning_rate": 4.2483267803398063e-07, + "loss": 0.0247, + "num_tokens": 2787455730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9166168814542972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20729696271943895, + "kl": 0.6025390625, + "learning_rate": 4.231162563400426e-07, + "loss": 0.0241, + "num_tokens": 2788018546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9167875736109926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07672909432212931, + "kl": 0.630859375, + "learning_rate": 4.214032340941265e-07, + "loss": 0.0253, + "num_tokens": 2788578402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.916958265767688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36339094685284196, + "kl": 0.595703125, + "learning_rate": 4.1969361190429713e-07, + "loss": 0.0238, + "num_tokens": 2789143538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9171289579243834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15794018424894296, + "kl": 0.57421875, + "learning_rate": 4.1798739037740454e-07, + "loss": 0.0229, + "num_tokens": 2789704466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9172996500810787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09520794379557541, + "kl": 0.6123046875, + "learning_rate": 4.1628457011909874e-07, + "loss": 0.0245, + "num_tokens": 2790264258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9174703422377741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1229701724853722, + "kl": 0.5166015625, + "learning_rate": 4.145851517338162e-07, + "loss": 0.0207, + "num_tokens": 2790830002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9176410343944695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5094637973515339, + "kl": 0.59375, + "learning_rate": 4.128891358247933e-07, + "loss": 0.0238, + "num_tokens": 2791391474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9178117265511649, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26398538843141467, + "kl": 0.6044921875, + "learning_rate": 4.111965229940518e-07, + "loss": 0.0242, + "num_tokens": 2791953170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9179824187078603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12961534495840124, + "kl": 0.5380859375, + "learning_rate": 4.0950731384240885e-07, + "loss": 0.0215, + "num_tokens": 2792515218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9181531108645558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.335482313186217, + "kl": 0.568359375, + "learning_rate": 4.078215089694715e-07, + "loss": 0.0227, + "num_tokens": 2793083634.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9183238030212512, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09167051156144868, + "kl": 0.5625, + "learning_rate": 4.061391089736422e-07, + "loss": 0.0225, + "num_tokens": 2793653442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9184944951779466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05398735047134629, + "kl": 0.49951171875, + "learning_rate": 4.0446011445210876e-07, + "loss": 0.02, + "num_tokens": 2794218466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.918665187334642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19609056316906928, + "kl": 0.54931640625, + "learning_rate": 4.0278452600085674e-07, + "loss": 0.0219, + "num_tokens": 2794778994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9188358794913374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1101661915201404, + "kl": 0.5166015625, + "learning_rate": 4.0111234421465917e-07, + "loss": 0.0207, + "num_tokens": 2795338482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9190065716480328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.157327837206441, + "kl": 0.50732421875, + "learning_rate": 3.994435696870791e-07, + "loss": 0.0203, + "num_tokens": 2795903874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9191772638047282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1499135622468296, + "kl": 0.4892578125, + "learning_rate": 3.9777820301047155e-07, + "loss": 0.0196, + "num_tokens": 2796464450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9193479559614236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2014994985712751, + "kl": 0.4892578125, + "learning_rate": 3.9611624477598034e-07, + "loss": 0.0196, + "num_tokens": 2797039378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.919518648118119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5640593332744133, + "kl": 0.49462890625, + "learning_rate": 3.9445769557354353e-07, + "loss": 0.0198, + "num_tokens": 2797616626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9196893402748144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2334160640609496, + "kl": 0.50927734375, + "learning_rate": 3.9280255599188466e-07, + "loss": 0.0204, + "num_tokens": 2798182226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9198600324315098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14660507390042016, + "kl": 0.47509765625, + "learning_rate": 3.911508266185171e-07, + "loss": 0.019, + "num_tokens": 2798746418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9200307245882051, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28982536239336865, + "kl": 0.5361328125, + "learning_rate": 3.895025080397474e-07, + "loss": 0.0214, + "num_tokens": 2799305474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9202014167449005, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2333456575784627, + "kl": 0.50732421875, + "learning_rate": 3.878576008406687e-07, + "loss": 0.0203, + "num_tokens": 2799875602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9203721089015959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22710017790483766, + "kl": 0.484375, + "learning_rate": 3.862161056051628e-07, + "loss": 0.0194, + "num_tokens": 2800441218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9205428010582913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14869003277842074, + "kl": 0.4833984375, + "learning_rate": 3.8457802291590127e-07, + "loss": 0.0193, + "num_tokens": 2801019250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9207134932149867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20240312771856742, + "kl": 0.4853515625, + "learning_rate": 3.8294335335434475e-07, + "loss": 0.0194, + "num_tokens": 2801582802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9208841853716822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3802823636427628, + "kl": 0.49072265625, + "learning_rate": 3.813120975007434e-07, + "loss": 0.0197, + "num_tokens": 2802147346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9210548775283776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3123327605477346, + "kl": 0.5126953125, + "learning_rate": 3.796842559341329e-07, + "loss": 0.0205, + "num_tokens": 2802710370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.921225569685073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.392124384564261, + "kl": 0.580078125, + "learning_rate": 3.7805982923233674e-07, + "loss": 0.0232, + "num_tokens": 2803276706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9213962618417684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21198969620898156, + "kl": 0.44775390625, + "learning_rate": 3.7643881797196803e-07, + "loss": 0.0179, + "num_tokens": 2803848482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9215669539984638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27495713167399793, + "kl": 0.48828125, + "learning_rate": 3.748212227284298e-07, + "loss": 0.0196, + "num_tokens": 2804417522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9217376461551592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13617646315612894, + "kl": 0.4990234375, + "learning_rate": 3.732070440759106e-07, + "loss": 0.02, + "num_tokens": 2804982898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9219083383118546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13456175339721113, + "kl": 0.5439453125, + "learning_rate": 3.715962825873798e-07, + "loss": 0.0217, + "num_tokens": 2805547522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.92207903046855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4300721344060381, + "kl": 0.55810546875, + "learning_rate": 3.699889388346045e-07, + "loss": 0.0223, + "num_tokens": 2806110418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9222497226252454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10955725691994303, + "kl": 0.45068359375, + "learning_rate": 3.6838501338813283e-07, + "loss": 0.018, + "num_tokens": 2806675394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9224204147819408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07258114416712447, + "kl": 0.5068359375, + "learning_rate": 3.667845068173004e-07, + "loss": 0.0203, + "num_tokens": 2807238386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9225911069386362, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.743044737411884, + "kl": 0.798828125, + "learning_rate": 3.651874196902294e-07, + "loss": 0.0319, + "num_tokens": 2807801314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9227617990953315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10440715875352395, + "kl": 0.5087890625, + "learning_rate": 3.6359375257382643e-07, + "loss": 0.0203, + "num_tokens": 2808370242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9229324912520269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12069453084464715, + "kl": 0.474609375, + "learning_rate": 3.6200350603378677e-07, + "loss": 0.019, + "num_tokens": 2808941906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9231031834087223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12341268818564677, + "kl": 0.4638671875, + "learning_rate": 3.60416680634591e-07, + "loss": 0.0186, + "num_tokens": 2809506578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9232738755654177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1465430757612218, + "kl": 0.52734375, + "learning_rate": 3.588332769395064e-07, + "loss": 0.0211, + "num_tokens": 2810082130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9234445677221131, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3032319930339882, + "kl": 0.5556640625, + "learning_rate": 3.5725329551058007e-07, + "loss": 0.0222, + "num_tokens": 2810646834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9236152598788085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2155839068121557, + "kl": 0.48876953125, + "learning_rate": 3.5567673690865114e-07, + "loss": 0.0195, + "num_tokens": 2811215170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.923785952035504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39647479100932276, + "kl": 0.5537109375, + "learning_rate": 3.541036016933419e-07, + "loss": 0.0221, + "num_tokens": 2811778434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9239566441921994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13668818778264372, + "kl": 0.5673828125, + "learning_rate": 3.525338904230569e-07, + "loss": 0.0227, + "num_tokens": 2812344082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9241273363488948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.512173567557047, + "kl": 0.5908203125, + "learning_rate": 3.509676036549858e-07, + "loss": 0.0236, + "num_tokens": 2812915474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9242980285055902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2609887181749649, + "kl": 0.5078125, + "learning_rate": 3.494047419451052e-07, + "loss": 0.0203, + "num_tokens": 2813478434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9244687206622856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31716710939368126, + "kl": 0.5595703125, + "learning_rate": 3.4784530584817676e-07, + "loss": 0.0224, + "num_tokens": 2814042866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.924639412818981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26323960903924826, + "kl": 0.48779296875, + "learning_rate": 3.462892959177411e-07, + "loss": 0.0195, + "num_tokens": 2814615794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9248101049756764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1420220713766399, + "kl": 0.46826171875, + "learning_rate": 3.447367127061263e-07, + "loss": 0.0187, + "num_tokens": 2815179746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9249807971323718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2502914790523827, + "kl": 0.4775390625, + "learning_rate": 3.431875567644427e-07, + "loss": 0.0191, + "num_tokens": 2815758770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9251514892890672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1511902566173227, + "kl": 0.45166015625, + "learning_rate": 3.416418286425871e-07, + "loss": 0.0181, + "num_tokens": 2816324754.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9253221814457626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1564550045522534, + "kl": 0.4501953125, + "learning_rate": 3.4009952888923506e-07, + "loss": 0.018, + "num_tokens": 2816893954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.925492873602458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11167729525253874, + "kl": 0.41650390625, + "learning_rate": 3.3856065805184746e-07, + "loss": 0.0167, + "num_tokens": 2817465666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9256635657591533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3104617887127913, + "kl": 0.431640625, + "learning_rate": 3.370252166766674e-07, + "loss": 0.0173, + "num_tokens": 2818033922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9258342579158487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23232647948522236, + "kl": 0.43505859375, + "learning_rate": 3.3549320530872453e-07, + "loss": 0.0174, + "num_tokens": 2818595026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9260049500725441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.112382724644888, + "kl": 0.42724609375, + "learning_rate": 3.3396462449182264e-07, + "loss": 0.0171, + "num_tokens": 2819155714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9261756422292395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2852817734499222, + "kl": 0.4072265625, + "learning_rate": 3.3243947476855664e-07, + "loss": 0.0163, + "num_tokens": 2819725106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9263463343859349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22317921237007532, + "kl": 0.412109375, + "learning_rate": 3.309177566802979e-07, + "loss": 0.0165, + "num_tokens": 2820285234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9265170265426304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13958077420449164, + "kl": 0.3994140625, + "learning_rate": 3.293994707672021e-07, + "loss": 0.016, + "num_tokens": 2820854242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9266877186993258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07405530348884379, + "kl": 0.40087890625, + "learning_rate": 3.278846175682049e-07, + "loss": 0.0161, + "num_tokens": 2821419826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9268584108560212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16935196402015104, + "kl": 0.39697265625, + "learning_rate": 3.2637319762102606e-07, + "loss": 0.0159, + "num_tokens": 2821984290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9270291030127166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08580209475652803, + "kl": 0.4111328125, + "learning_rate": 3.248652114621631e-07, + "loss": 0.0164, + "num_tokens": 2822550722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.927199795169412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0856088780714246, + "kl": 0.38720703125, + "learning_rate": 3.23360659626899e-07, + "loss": 0.0155, + "num_tokens": 2823118338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9273704873261074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06437238790446868, + "kl": 0.4033203125, + "learning_rate": 3.218595426492921e-07, + "loss": 0.0161, + "num_tokens": 2823680530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9275411794828028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1965667834457181, + "kl": 0.390625, + "learning_rate": 3.203618610621884e-07, + "loss": 0.0156, + "num_tokens": 2824244098.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9277118716394982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10574872888188218, + "kl": 0.42578125, + "learning_rate": 3.1886761539720813e-07, + "loss": 0.017, + "num_tokens": 2824815010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9278825637961936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1288160088716544, + "kl": 0.39453125, + "learning_rate": 3.1737680618475706e-07, + "loss": 0.0158, + "num_tokens": 2825386514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.928053255952889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09773788787432403, + "kl": 0.3701171875, + "learning_rate": 3.158894339540153e-07, + "loss": 0.0148, + "num_tokens": 2825950466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9282239481095844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13396299968287165, + "kl": 0.38818359375, + "learning_rate": 3.1440549923295036e-07, + "loss": 0.0155, + "num_tokens": 2826518770.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9283946402662797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07301607964135703, + "kl": 0.37353515625, + "learning_rate": 3.1292500254830106e-07, + "loss": 0.015, + "num_tokens": 2827080818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9285653324229751, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10895033009471534, + "kl": 0.4189453125, + "learning_rate": 3.114479444255947e-07, + "loss": 0.0168, + "num_tokens": 2827644306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9287360245796705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10721721571276772, + "kl": 0.44580078125, + "learning_rate": 3.099743253891296e-07, + "loss": 0.0178, + "num_tokens": 2828207794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9289067167363659, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07438196153033236, + "kl": 0.38037109375, + "learning_rate": 3.0850414596198976e-07, + "loss": 0.0152, + "num_tokens": 2828776658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9290774088930613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10288636008122533, + "kl": 0.39453125, + "learning_rate": 3.0703740666603534e-07, + "loss": 0.0158, + "num_tokens": 2829343618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9292481010497567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1415680291533278, + "kl": 0.47412109375, + "learning_rate": 3.055741080219066e-07, + "loss": 0.019, + "num_tokens": 2829908562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9294187932064522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040845851898796234, + "kl": 0.3916015625, + "learning_rate": 3.041142505490191e-07, + "loss": 0.0157, + "num_tokens": 2830472466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9295894853631476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19117337328964293, + "kl": 0.4365234375, + "learning_rate": 3.0265783476557174e-07, + "loss": 0.0175, + "num_tokens": 2831040898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.929760177519843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06801434287701223, + "kl": 0.42724609375, + "learning_rate": 3.012048611885399e-07, + "loss": 0.0171, + "num_tokens": 2831607362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9299308696765384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05456490283260006, + "kl": 0.423828125, + "learning_rate": 2.997553303336753e-07, + "loss": 0.0169, + "num_tokens": 2832170498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9301015618332338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1859807169772305, + "kl": 0.478515625, + "learning_rate": 2.983092427155099e-07, + "loss": 0.0191, + "num_tokens": 2832737698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9302722539899292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09816619015089155, + "kl": 0.42919921875, + "learning_rate": 2.968665988473518e-07, + "loss": 0.0171, + "num_tokens": 2833311986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9304429461466246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13282431445649667, + "kl": 0.44287109375, + "learning_rate": 2.954273992412893e-07, + "loss": 0.0177, + "num_tokens": 2833878274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.93061363830332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18297332978885064, + "kl": 0.4453125, + "learning_rate": 2.9399164440818474e-07, + "loss": 0.0178, + "num_tokens": 2834452258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9307843304600154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15140881031150422, + "kl": 0.52099609375, + "learning_rate": 2.925593348576794e-07, + "loss": 0.0208, + "num_tokens": 2835018370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9309550226167108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14343338891522045, + "kl": 0.4326171875, + "learning_rate": 2.91130471098191e-07, + "loss": 0.0173, + "num_tokens": 2835587266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9311257147734061, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.358406077793681, + "kl": 0.6513671875, + "learning_rate": 2.897050536369161e-07, + "loss": 0.0261, + "num_tokens": 2836154610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9312964069301015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13633219697200746, + "kl": 0.453125, + "learning_rate": 2.8828308297982664e-07, + "loss": 0.0181, + "num_tokens": 2836719538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9314670990867969, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12292805224517717, + "kl": 0.45849609375, + "learning_rate": 2.868645596316677e-07, + "loss": 0.0183, + "num_tokens": 2837284466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9316377912434923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09801347940014081, + "kl": 0.47021484375, + "learning_rate": 2.854494840959665e-07, + "loss": 0.0188, + "num_tokens": 2837854226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9318084834001877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2058596700638415, + "kl": 0.45458984375, + "learning_rate": 2.840378568750235e-07, + "loss": 0.0182, + "num_tokens": 2838414834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9319791755568831, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21802851167784748, + "kl": 0.4794921875, + "learning_rate": 2.8262967846991787e-07, + "loss": 0.0192, + "num_tokens": 2838984802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9321498677135786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2083856721075041, + "kl": 0.54296875, + "learning_rate": 2.812249493804964e-07, + "loss": 0.0217, + "num_tokens": 2839551490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.932320559870274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10846459804831579, + "kl": 0.41845703125, + "learning_rate": 2.798236701053902e-07, + "loss": 0.0167, + "num_tokens": 2840114450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9324912520269694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13615804220289274, + "kl": 0.45458984375, + "learning_rate": 2.784258411420038e-07, + "loss": 0.0182, + "num_tokens": 2840677330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9326619441836648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20294697415903581, + "kl": 0.46044921875, + "learning_rate": 2.770314629865156e-07, + "loss": 0.0184, + "num_tokens": 2841244610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9328326363403602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1138498259605278, + "kl": 0.46826171875, + "learning_rate": 2.7564053613387873e-07, + "loss": 0.0187, + "num_tokens": 2841806402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9330033284970556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4507466694668992, + "kl": 0.4619140625, + "learning_rate": 2.7425306107782157e-07, + "loss": 0.0185, + "num_tokens": 2842369874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.933174020653751, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10666077466096978, + "kl": 0.49658203125, + "learning_rate": 2.72869038310849e-07, + "loss": 0.0199, + "num_tokens": 2842941138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9333447128104464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24592690010826943, + "kl": 0.51171875, + "learning_rate": 2.7148846832423914e-07, + "loss": 0.0205, + "num_tokens": 2843508338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9335154049671418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22559805139490746, + "kl": 0.4404296875, + "learning_rate": 2.7011135160804335e-07, + "loss": 0.0176, + "num_tokens": 2844071522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9336860971238372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1147457099308207, + "kl": 0.47705078125, + "learning_rate": 2.687376886510884e-07, + "loss": 0.0191, + "num_tokens": 2844636882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9338567892805325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23757186458190277, + "kl": 0.46142578125, + "learning_rate": 2.6736747994097536e-07, + "loss": 0.0185, + "num_tokens": 2845200050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9340274814372279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4517563463816545, + "kl": 0.52197265625, + "learning_rate": 2.660007259640807e-07, + "loss": 0.0209, + "num_tokens": 2845765490.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9341981735939233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11163709678845682, + "kl": 0.42333984375, + "learning_rate": 2.6463742720555076e-07, + "loss": 0.0169, + "num_tokens": 2846331938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9343688657506187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13380598982379593, + "kl": 0.44580078125, + "learning_rate": 2.6327758414930627e-07, + "loss": 0.0178, + "num_tokens": 2846894082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9345395579073141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15866834318540154, + "kl": 0.484375, + "learning_rate": 2.6192119727804445e-07, + "loss": 0.0194, + "num_tokens": 2847468930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9347102500640095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16511227736463627, + "kl": 0.44482421875, + "learning_rate": 2.6056826707323454e-07, + "loss": 0.0178, + "num_tokens": 2848035474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.934880942220705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21582196708179696, + "kl": 0.45654296875, + "learning_rate": 2.5921879401511587e-07, + "loss": 0.0183, + "num_tokens": 2848601266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9350516343774004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19601401361938312, + "kl": 0.44873046875, + "learning_rate": 2.5787277858270285e-07, + "loss": 0.0179, + "num_tokens": 2849163970.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9352223265340958, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2870418460152607, + "kl": 0.45849609375, + "learning_rate": 2.5653022125378234e-07, + "loss": 0.0183, + "num_tokens": 2849729474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9353930186907912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13779601595175076, + "kl": 0.48779296875, + "learning_rate": 2.5519112250491527e-07, + "loss": 0.0195, + "num_tokens": 2850299746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9355637108474866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12357943195074188, + "kl": 0.49267578125, + "learning_rate": 2.5385548281143257e-07, + "loss": 0.0197, + "num_tokens": 2850862690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.935734403004182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13851300770455713, + "kl": 0.4306640625, + "learning_rate": 2.525233026474361e-07, + "loss": 0.0172, + "num_tokens": 2851426498.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9359050951608774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10713789441653403, + "kl": 0.40283203125, + "learning_rate": 2.511945824858042e-07, + "loss": 0.0161, + "num_tokens": 2851991938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9360757873175728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07320387419608551, + "kl": 0.48095703125, + "learning_rate": 2.498693227981852e-07, + "loss": 0.0192, + "num_tokens": 2852558210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9362464794742682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2350585568901814, + "kl": 0.427734375, + "learning_rate": 2.4854752405499505e-07, + "loss": 0.0171, + "num_tokens": 2853126050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9364171716309636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1278300629705168, + "kl": 0.4296875, + "learning_rate": 2.472291867254284e-07, + "loss": 0.0172, + "num_tokens": 2853694002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9365878637876589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04146181926728447, + "kl": 0.4013671875, + "learning_rate": 2.4591431127744424e-07, + "loss": 0.0161, + "num_tokens": 2854265666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9367585559443543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0526826919824903, + "kl": 0.47021484375, + "learning_rate": 2.446028981777793e-07, + "loss": 0.0188, + "num_tokens": 2854830818.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9369292481010497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15133420483599205, + "kl": 0.42431640625, + "learning_rate": 2.4329494789193443e-07, + "loss": 0.017, + "num_tokens": 2855393666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9370999402577451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21388112404431953, + "kl": 0.40673828125, + "learning_rate": 2.419904608841861e-07, + "loss": 0.0163, + "num_tokens": 2855957778.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9372706324144405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9055677581708581, + "kl": 0.5087890625, + "learning_rate": 2.4068943761758055e-07, + "loss": 0.0204, + "num_tokens": 2856537218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9374413245711359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3091883480994676, + "kl": 0.47021484375, + "learning_rate": 2.39391878553934e-07, + "loss": 0.0188, + "num_tokens": 2857105010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9376120167278313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06051572377364517, + "kl": 0.41748046875, + "learning_rate": 2.3809778415383233e-07, + "loss": 0.0167, + "num_tokens": 2857675010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9377827088845268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15304273979730695, + "kl": 0.43896484375, + "learning_rate": 2.3680715487663374e-07, + "loss": 0.0176, + "num_tokens": 2858242482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9379534010412222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27405475887877667, + "kl": 0.5107421875, + "learning_rate": 2.3551999118046286e-07, + "loss": 0.0204, + "num_tokens": 2858805058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9381240931979176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14998527544483622, + "kl": 0.490234375, + "learning_rate": 2.3423629352221867e-07, + "loss": 0.0196, + "num_tokens": 2859366882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.938294785354613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4656112622320005, + "kl": 0.5458984375, + "learning_rate": 2.3295606235756662e-07, + "loss": 0.0218, + "num_tokens": 2859930514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9384654775113084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2845503584293792, + "kl": 0.49462890625, + "learning_rate": 2.3167929814094214e-07, + "loss": 0.0198, + "num_tokens": 2860495810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9386361696680038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2348755864951945, + "kl": 0.46533203125, + "learning_rate": 2.3040600132554937e-07, + "loss": 0.0186, + "num_tokens": 2861060002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9388068618246992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1291226149742505, + "kl": 0.4228515625, + "learning_rate": 2.291361723633656e-07, + "loss": 0.0169, + "num_tokens": 2861627106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9389775539813946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06063620287663941, + "kl": 0.4794921875, + "learning_rate": 2.2786981170513255e-07, + "loss": 0.0192, + "num_tokens": 2862195698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.93914824613809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2200895149187923, + "kl": 0.4912109375, + "learning_rate": 2.2660691980036176e-07, + "loss": 0.0197, + "num_tokens": 2862761074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9393189382947853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28828712078971336, + "kl": 0.4599609375, + "learning_rate": 2.2534749709733572e-07, + "loss": 0.0184, + "num_tokens": 2863328722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9394896304514807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8807682933277577, + "kl": 0.48046875, + "learning_rate": 2.240915440431046e-07, + "loss": 0.0192, + "num_tokens": 2863905074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9396603226081761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10065045531636438, + "kl": 0.47607421875, + "learning_rate": 2.2283906108348408e-07, + "loss": 0.019, + "num_tokens": 2864467602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9398310147648715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8020078639667576, + "kl": 0.490234375, + "learning_rate": 2.2159004866306068e-07, + "loss": 0.0196, + "num_tokens": 2865041922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9400017069215669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21497020768182887, + "kl": 0.46435546875, + "learning_rate": 2.2034450722519196e-07, + "loss": 0.0186, + "num_tokens": 2865610898.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9401723990782623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2930112633774277, + "kl": 0.509765625, + "learning_rate": 2.1910243721199765e-07, + "loss": 0.0204, + "num_tokens": 2866172370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9403430912349577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3273871579921771, + "kl": 0.50732421875, + "learning_rate": 2.1786383906436614e-07, + "loss": 0.0203, + "num_tokens": 2866741362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9405137833916531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26682993412571876, + "kl": 0.431640625, + "learning_rate": 2.16628713221958e-07, + "loss": 0.0173, + "num_tokens": 2867304834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9406844755483486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18081291520910753, + "kl": 0.47412109375, + "learning_rate": 2.1539706012319693e-07, + "loss": 0.019, + "num_tokens": 2867870082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.940855167705044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4018452846298848, + "kl": 0.4267578125, + "learning_rate": 2.141688802052766e-07, + "loss": 0.0171, + "num_tokens": 2868440690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9410258598617394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09443342475417728, + "kl": 0.4296875, + "learning_rate": 2.129441739041549e-07, + "loss": 0.0172, + "num_tokens": 2869007138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9411965520184348, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.3132863173174565, + "kl": 0.791015625, + "learning_rate": 2.1172294165455852e-07, + "loss": 0.0316, + "num_tokens": 2869576210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9413672441751302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07659318164245729, + "kl": 0.41455078125, + "learning_rate": 2.1050518388998188e-07, + "loss": 0.0166, + "num_tokens": 2870141906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9415379363318256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22613853589613298, + "kl": 0.42822265625, + "learning_rate": 2.092909010426847e-07, + "loss": 0.0171, + "num_tokens": 2870709842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.941708628488521, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2018527303236601, + "kl": 0.42236328125, + "learning_rate": 2.0808009354369328e-07, + "loss": 0.0169, + "num_tokens": 2871281138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9418793206452164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1849322528868082, + "kl": 0.46875, + "learning_rate": 2.0687276182279948e-07, + "loss": 0.0188, + "num_tokens": 2871864354.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9420500128019118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2719811018770675, + "kl": 0.4716796875, + "learning_rate": 2.0566890630856594e-07, + "loss": 0.0189, + "num_tokens": 2872430690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9422207049586071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08601256433454282, + "kl": 0.48583984375, + "learning_rate": 2.0446852742831423e-07, + "loss": 0.0195, + "num_tokens": 2872994914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9423913971153025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10378681921264535, + "kl": 0.43017578125, + "learning_rate": 2.0327162560813685e-07, + "loss": 0.0172, + "num_tokens": 2873565986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9425620892719979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16455438239189785, + "kl": 0.44482421875, + "learning_rate": 2.0207820127289057e-07, + "loss": 0.0178, + "num_tokens": 2874138434.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9427327814286933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19352515248936938, + "kl": 0.44482421875, + "learning_rate": 2.008882548461988e-07, + "loss": 0.0178, + "num_tokens": 2874703394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9429034735853887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05449145139135621, + "kl": 0.4091796875, + "learning_rate": 1.9970178675044916e-07, + "loss": 0.0163, + "num_tokens": 2875267042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9430741657420841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15999336161420544, + "kl": 0.4052734375, + "learning_rate": 1.985187974067937e-07, + "loss": 0.0162, + "num_tokens": 2875835474.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9432448578987795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24947744980653824, + "kl": 0.43017578125, + "learning_rate": 1.9733928723515204e-07, + "loss": 0.0172, + "num_tokens": 2876399746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.943415550055475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17201376593267417, + "kl": 0.4267578125, + "learning_rate": 1.9616325665420932e-07, + "loss": 0.0171, + "num_tokens": 2876966690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9435862422121704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14431419490400274, + "kl": 0.43603515625, + "learning_rate": 1.9499070608141048e-07, + "loss": 0.0174, + "num_tokens": 2877530050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9437569343688658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25840618870100285, + "kl": 0.4716796875, + "learning_rate": 1.9382163593297143e-07, + "loss": 0.0189, + "num_tokens": 2878093394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9439276265255612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25542007748332785, + "kl": 0.46142578125, + "learning_rate": 1.9265604662386804e-07, + "loss": 0.0184, + "num_tokens": 2878657842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9440983186822566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12371804229486785, + "kl": 0.4228515625, + "learning_rate": 1.9149393856784493e-07, + "loss": 0.0169, + "num_tokens": 2879220738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.944269010838952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25169998937977156, + "kl": 0.43212890625, + "learning_rate": 1.9033531217740542e-07, + "loss": 0.0173, + "num_tokens": 2879783410.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9444397029956474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2776352079821296, + "kl": 0.46484375, + "learning_rate": 1.8918016786382276e-07, + "loss": 0.0186, + "num_tokens": 2880344738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9446103951523428, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17791706491078887, + "kl": 0.4384765625, + "learning_rate": 1.8802850603712898e-07, + "loss": 0.0175, + "num_tokens": 2880908178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9447810873090382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12555184033794825, + "kl": 0.44287109375, + "learning_rate": 1.868803271061248e-07, + "loss": 0.0177, + "num_tokens": 2881475426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9449517794657335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21462440606179836, + "kl": 0.46044921875, + "learning_rate": 1.8573563147836983e-07, + "loss": 0.0184, + "num_tokens": 2882040370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9451224716224289, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4795114542789336, + "kl": 0.53759765625, + "learning_rate": 1.845944195601923e-07, + "loss": 0.0215, + "num_tokens": 2882605010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9452931637791243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.890927862477555, + "kl": 0.4775390625, + "learning_rate": 1.8345669175667935e-07, + "loss": 0.0191, + "num_tokens": 2883166594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9454638559358197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13084455102895254, + "kl": 0.486328125, + "learning_rate": 1.8232244847168456e-07, + "loss": 0.0195, + "num_tokens": 2883739362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9456345480925151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17853467036519358, + "kl": 0.4052734375, + "learning_rate": 1.8119169010782257e-07, + "loss": 0.0162, + "num_tokens": 2884306274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9458052402492105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28464727158797176, + "kl": 0.50634765625, + "learning_rate": 1.8006441706647225e-07, + "loss": 0.0203, + "num_tokens": 2884869170.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9459759324059059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18793904363219055, + "kl": 0.43212890625, + "learning_rate": 1.789406297477736e-07, + "loss": 0.0173, + "num_tokens": 2885438546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9461466245626013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8039538169099381, + "kl": 0.4970703125, + "learning_rate": 1.7782032855063302e-07, + "loss": 0.0199, + "num_tokens": 2886016130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9463173167192968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2551684643510148, + "kl": 0.46044921875, + "learning_rate": 1.7670351387271356e-07, + "loss": 0.0184, + "num_tokens": 2886582626.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9464880088759922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1156547060573907, + "kl": 0.537109375, + "learning_rate": 1.7559018611044697e-07, + "loss": 0.0215, + "num_tokens": 2887148338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9466587010326876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1282020269123412, + "kl": 0.5, + "learning_rate": 1.7448034565902384e-07, + "loss": 0.02, + "num_tokens": 2887714610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.946829393189383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06283827627713165, + "kl": 0.47705078125, + "learning_rate": 1.7337399291239676e-07, + "loss": 0.0191, + "num_tokens": 2888284290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9470000853460784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11327089983608969, + "kl": 0.46435546875, + "learning_rate": 1.7227112826328162e-07, + "loss": 0.0186, + "num_tokens": 2888846930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9471707775027738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08937972700698649, + "kl": 0.44140625, + "learning_rate": 1.7117175210315418e-07, + "loss": 0.0177, + "num_tokens": 2889410962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9473414696594692, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.4804158957997378, + "kl": 0.6123046875, + "learning_rate": 1.7007586482225557e-07, + "loss": 0.0245, + "num_tokens": 2889972450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9475121618161646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.655638524908569, + "kl": 0.50146484375, + "learning_rate": 1.6898346680958577e-07, + "loss": 0.02, + "num_tokens": 2890535698.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9476828539728599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6084976611196952, + "kl": 0.5625, + "learning_rate": 1.678945584529046e-07, + "loss": 0.0225, + "num_tokens": 2891102386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9478535461295553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21795848937291756, + "kl": 0.45849609375, + "learning_rate": 1.6680914013873616e-07, + "loss": 0.0183, + "num_tokens": 2891671394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9480242382862507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20002279662619135, + "kl": 0.44580078125, + "learning_rate": 1.6572721225236676e-07, + "loss": 0.0178, + "num_tokens": 2892239010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9481949304429461, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21211283378589987, + "kl": 0.4658203125, + "learning_rate": 1.6464877517784027e-07, + "loss": 0.0186, + "num_tokens": 2892804066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9483656225996415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.40695227677122253, + "kl": 0.4990234375, + "learning_rate": 1.6357382929796162e-07, + "loss": 0.02, + "num_tokens": 2893373106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9485363147563369, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08341306073287054, + "kl": 0.3896484375, + "learning_rate": 1.6250237499429888e-07, + "loss": 0.0156, + "num_tokens": 2893941522.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9487070069130323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37025528800554725, + "kl": 0.46533203125, + "learning_rate": 1.6143441264718008e-07, + "loss": 0.0186, + "num_tokens": 2894513730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9488776990697277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17992662074807986, + "kl": 0.42138671875, + "learning_rate": 1.6036994263569307e-07, + "loss": 0.0168, + "num_tokens": 2895083218.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9490483912264231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17379494855398597, + "kl": 0.45068359375, + "learning_rate": 1.5930896533768559e-07, + "loss": 0.018, + "num_tokens": 2895648450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9492190833831186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09964494722235899, + "kl": 0.43896484375, + "learning_rate": 1.5825148112976752e-07, + "loss": 0.0176, + "num_tokens": 2896216338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.949389775539814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1870091068596537, + "kl": 0.41748046875, + "learning_rate": 1.5719749038730635e-07, + "loss": 0.0167, + "num_tokens": 2896786418.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9495604676965094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2910615849750252, + "kl": 0.421875, + "learning_rate": 1.5614699348443286e-07, + "loss": 0.0169, + "num_tokens": 2897348370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9497311598532048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28689366371855124, + "kl": 0.4306640625, + "learning_rate": 1.5509999079403314e-07, + "loss": 0.0172, + "num_tokens": 2897914258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9499018520099002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2330629824978367, + "kl": 0.42236328125, + "learning_rate": 1.5405648268775554e-07, + "loss": 0.0169, + "num_tokens": 2898473058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9500725441665956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3335743310860663, + "kl": 0.4609375, + "learning_rate": 1.5301646953600812e-07, + "loss": 0.0184, + "num_tokens": 2899039298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.950243236323291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12117366630450861, + "kl": 0.4130859375, + "learning_rate": 1.5197995170795897e-07, + "loss": 0.0165, + "num_tokens": 2899605714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9504139284799863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08525035101476684, + "kl": 0.421875, + "learning_rate": 1.509469295715349e-07, + "loss": 0.0169, + "num_tokens": 2900162738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9505846206366817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17330847960831228, + "kl": 0.404296875, + "learning_rate": 1.499174034934192e-07, + "loss": 0.0162, + "num_tokens": 2900732034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9507553127933771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1684604242684308, + "kl": 0.41015625, + "learning_rate": 1.4889137383905737e-07, + "loss": 0.0164, + "num_tokens": 2901303090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9509260049500725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154404088513675, + "kl": 0.45166015625, + "learning_rate": 1.4786884097265475e-07, + "loss": 0.0181, + "num_tokens": 2901867746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9510966971067679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15483274909756428, + "kl": 0.46630859375, + "learning_rate": 1.4684980525717096e-07, + "loss": 0.0186, + "num_tokens": 2902433938.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9512673892634633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31871411439020175, + "kl": 0.45947265625, + "learning_rate": 1.458342670543278e-07, + "loss": 0.0184, + "num_tokens": 2903000610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9514380814201587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203159269457377, + "kl": 0.400390625, + "learning_rate": 1.4482222672460466e-07, + "loss": 0.016, + "num_tokens": 2903566210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9516087735768541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043785530973257845, + "kl": 0.44091796875, + "learning_rate": 1.4381368462724088e-07, + "loss": 0.0176, + "num_tokens": 2904132978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9517794657335495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20118069146629325, + "kl": 0.39013671875, + "learning_rate": 1.4280864112023118e-07, + "loss": 0.0156, + "num_tokens": 2904700290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.951950157890245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3236994114493251, + "kl": 0.455078125, + "learning_rate": 1.41807096560328e-07, + "loss": 0.0182, + "num_tokens": 2905265090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9521208500469404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23481883908433956, + "kl": 0.42431640625, + "learning_rate": 1.408090513030469e-07, + "loss": 0.017, + "num_tokens": 2905833362.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9522915422036358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39599908104601955, + "kl": 0.4921875, + "learning_rate": 1.3981450570265564e-07, + "loss": 0.0197, + "num_tokens": 2906402866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9524622343603312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11018723154155143, + "kl": 0.43505859375, + "learning_rate": 1.3882346011218295e-07, + "loss": 0.0174, + "num_tokens": 2906974114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9526329265170266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09446531384868026, + "kl": 0.427734375, + "learning_rate": 1.3783591488341296e-07, + "loss": 0.0171, + "num_tokens": 2907542642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.952803618673722, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12963470805560248, + "kl": 0.4462890625, + "learning_rate": 1.3685187036688975e-07, + "loss": 0.0178, + "num_tokens": 2908109586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9529743108304174, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3200233507140635, + "kl": 0.54443359375, + "learning_rate": 1.3587132691191385e-07, + "loss": 0.0218, + "num_tokens": 2908676338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9531450029871127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08682755271548999, + "kl": 0.4091796875, + "learning_rate": 1.3489428486654243e-07, + "loss": 0.0164, + "num_tokens": 2909242274.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9533156951438081, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16623849496012774, + "kl": 0.42529296875, + "learning_rate": 1.339207445775892e-07, + "loss": 0.017, + "num_tokens": 2909809090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9534863873005035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11654154133083622, + "kl": 0.4619140625, + "learning_rate": 1.3295070639062658e-07, + "loss": 0.0185, + "num_tokens": 2910374242.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9536570794571989, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1773563328722415, + "kl": 0.44677734375, + "learning_rate": 1.319841706499847e-07, + "loss": 0.0179, + "num_tokens": 2910940210.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9538277716138943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08707711871077974, + "kl": 0.43505859375, + "learning_rate": 1.3102113769874581e-07, + "loss": 0.0174, + "num_tokens": 2911507618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9539984637705897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15747885151177302, + "kl": 0.48828125, + "learning_rate": 1.3006160787875422e-07, + "loss": 0.0195, + "num_tokens": 2912072130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9541691559272851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1030062065758513, + "kl": 0.3955078125, + "learning_rate": 1.2910558153060638e-07, + "loss": 0.0158, + "num_tokens": 2912639922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9543398480839805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26098085145098315, + "kl": 0.40771484375, + "learning_rate": 1.2815305899365972e-07, + "loss": 0.0163, + "num_tokens": 2913199762.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9545105402406759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12479636571889507, + "kl": 0.4453125, + "learning_rate": 1.2720404060602266e-07, + "loss": 0.0178, + "num_tokens": 2913766834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9546812323973713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07363753176838665, + "kl": 0.38232421875, + "learning_rate": 1.2625852670456462e-07, + "loss": 0.0153, + "num_tokens": 2914334946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9548519245540668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2836178822733197, + "kl": 0.42333984375, + "learning_rate": 1.2531651762490716e-07, + "loss": 0.0169, + "num_tokens": 2914926914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9550226167107622, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06568947209437176, + "kl": 0.46435546875, + "learning_rate": 1.2437801370143166e-07, + "loss": 0.0186, + "num_tokens": 2915488402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9551933088674576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13587480667222554, + "kl": 0.4208984375, + "learning_rate": 1.2344301526727053e-07, + "loss": 0.0168, + "num_tokens": 2916053442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.955364001024153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18776053870081502, + "kl": 0.44921875, + "learning_rate": 1.2251152265431608e-07, + "loss": 0.0179, + "num_tokens": 2916615794.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9555346931808484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12162730768667525, + "kl": 0.44384765625, + "learning_rate": 1.2158353619321382e-07, + "loss": 0.0177, + "num_tokens": 2917182946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9557053853375438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15335676562486278, + "kl": 0.40087890625, + "learning_rate": 1.2065905621336692e-07, + "loss": 0.016, + "num_tokens": 2917747330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9558760774942391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14736249139772079, + "kl": 0.43310546875, + "learning_rate": 1.1973808304293067e-07, + "loss": 0.0173, + "num_tokens": 2918310882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9560467696509345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25505666721524106, + "kl": 0.4306640625, + "learning_rate": 1.1882061700881908e-07, + "loss": 0.0172, + "num_tokens": 2918879314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9562174618076299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2568413034382073, + "kl": 0.39013671875, + "learning_rate": 1.1790665843669614e-07, + "loss": 0.0156, + "num_tokens": 2919447554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9563881539643253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13865745725953282, + "kl": 0.44677734375, + "learning_rate": 1.1699620765098785e-07, + "loss": 0.0179, + "num_tokens": 2920016914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9565588461210207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2740485972219663, + "kl": 0.4169921875, + "learning_rate": 1.1608926497487017e-07, + "loss": 0.0167, + "num_tokens": 2920578738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9567295382777161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5124454611634285, + "kl": 0.396484375, + "learning_rate": 1.1518583073027334e-07, + "loss": 0.0159, + "num_tokens": 2921142738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9569002304344115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14852079250337014, + "kl": 0.43115234375, + "learning_rate": 1.142859052378864e-07, + "loss": 0.0173, + "num_tokens": 2921710386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9570709225911069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34940874455205206, + "kl": 0.40087890625, + "learning_rate": 1.133894888171494e-07, + "loss": 0.016, + "num_tokens": 2922276642.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9572416147478023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10489359516702865, + "kl": 0.37109375, + "learning_rate": 1.1249658178625777e-07, + "loss": 0.0148, + "num_tokens": 2922843730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9574123069044977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1035494338598285, + "kl": 0.40234375, + "learning_rate": 1.1160718446216023e-07, + "loss": 0.0161, + "num_tokens": 2923407122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9575829990611932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2271931235646574, + "kl": 0.4375, + "learning_rate": 1.1072129716056312e-07, + "loss": 0.0175, + "num_tokens": 2923974482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9577536912178886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12385284311717103, + "kl": 0.4072265625, + "learning_rate": 1.0983892019592269e-07, + "loss": 0.0163, + "num_tokens": 2924547282.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.957924383374584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3044872642494583, + "kl": 0.4140625, + "learning_rate": 1.089600538814506e-07, + "loss": 0.0166, + "num_tokens": 2925108226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9580950755312794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1712142479601402, + "kl": 0.41943359375, + "learning_rate": 1.0808469852911285e-07, + "loss": 0.0168, + "num_tokens": 2925677874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9582657676879748, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0990836054304362, + "kl": 0.4111328125, + "learning_rate": 1.0721285444963092e-07, + "loss": 0.0164, + "num_tokens": 2926240514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9584364598446702, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1211106511909303, + "kl": 0.44921875, + "learning_rate": 1.0634452195247613e-07, + "loss": 0.018, + "num_tokens": 2926804914.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9586071520013656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11562307570789364, + "kl": 0.42431640625, + "learning_rate": 1.0547970134587527e-07, + "loss": 0.017, + "num_tokens": 2927375602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9587778441580609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12915884661598637, + "kl": 0.4609375, + "learning_rate": 1.0461839293680831e-07, + "loss": 0.0184, + "num_tokens": 2927937986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9589485363147563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1847585858315867, + "kl": 0.4208984375, + "learning_rate": 1.037605970310096e-07, + "loss": 0.0168, + "num_tokens": 2928507570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9591192284714517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14788446390208623, + "kl": 0.3740234375, + "learning_rate": 1.0290631393296557e-07, + "loss": 0.015, + "num_tokens": 2929069730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9592899206281471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11110654994524541, + "kl": 0.36376953125, + "learning_rate": 1.0205554394591589e-07, + "loss": 0.0145, + "num_tokens": 2929640370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9594606127848425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1605115823448177, + "kl": 0.42236328125, + "learning_rate": 1.0120828737185118e-07, + "loss": 0.0169, + "num_tokens": 2930210722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9596313049415379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16847772131049565, + "kl": 0.39404296875, + "learning_rate": 1.0036454451151978e-07, + "loss": 0.0158, + "num_tokens": 2930782850.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9598019970982333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11530500055919542, + "kl": 0.39404296875, + "learning_rate": 9.952431566441877e-08, + "loss": 0.0158, + "num_tokens": 2931348994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9599726892549287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08269644718295471, + "kl": 0.4140625, + "learning_rate": 9.86876011287996e-08, + "loss": 0.0166, + "num_tokens": 2931917314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9601433814116241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4217030707285245, + "kl": 0.41796875, + "learning_rate": 9.785440120166356e-08, + "loss": 0.0167, + "num_tokens": 2932509378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9603140735683195, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.715960062214726, + "kl": 0.61474609375, + "learning_rate": 9.702471617876851e-08, + "loss": 0.0245, + "num_tokens": 2933077026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.960484765725015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1289618027999331, + "kl": 0.42578125, + "learning_rate": 9.619854635462445e-08, + "loss": 0.017, + "num_tokens": 2933643010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9606554578817104, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13869891998989806, + "kl": 0.4189453125, + "learning_rate": 9.537589202248788e-08, + "loss": 0.0167, + "num_tokens": 2934207058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9608261500384058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15936722026850164, + "kl": 0.43896484375, + "learning_rate": 9.455675347437299e-08, + "loss": 0.0175, + "num_tokens": 2934778978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9609968421951012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2833520001328336, + "kl": 0.4677734375, + "learning_rate": 9.374113100104498e-08, + "loss": 0.0187, + "num_tokens": 2935341234.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9611675343517966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5829693367522043, + "kl": 0.46728515625, + "learning_rate": 9.292902489202005e-08, + "loss": 0.0187, + "num_tokens": 2935906594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.961338226508492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11125119273169456, + "kl": 0.40771484375, + "learning_rate": 9.212043543556759e-08, + "loss": 0.0163, + "num_tokens": 2936475298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9615089186651873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10424423476696063, + "kl": 0.41064453125, + "learning_rate": 9.13153629187058e-08, + "loss": 0.0164, + "num_tokens": 2937037186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9616796108218827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04445641700734058, + "kl": 0.3994140625, + "learning_rate": 9.051380762720607e-08, + "loss": 0.016, + "num_tokens": 2937602386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9618503029785781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.075165063197082, + "kl": 0.4345703125, + "learning_rate": 8.971576984559416e-08, + "loss": 0.0174, + "num_tokens": 2938164738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9620209951352735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14000146086262716, + "kl": 0.43359375, + "learning_rate": 8.892124985714346e-08, + "loss": 0.0174, + "num_tokens": 2938730114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9621916872919689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14936173154804586, + "kl": 0.43701171875, + "learning_rate": 8.813024794387947e-08, + "loss": 0.0175, + "num_tokens": 2939288002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9623623794486643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23996117929058142, + "kl": 0.4326171875, + "learning_rate": 8.734276438657874e-08, + "loss": 0.0173, + "num_tokens": 2939856370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9625330716053597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27520590052336763, + "kl": 0.43115234375, + "learning_rate": 8.655879946477097e-08, + "loss": 0.0172, + "num_tokens": 2940415330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9627037637620551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10207013229607607, + "kl": 0.38671875, + "learning_rate": 8.577835345673468e-08, + "loss": 0.0155, + "num_tokens": 2940980834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9628744559187505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14674702689622265, + "kl": 0.4033203125, + "learning_rate": 8.50014266394994e-08, + "loss": 0.0162, + "num_tokens": 2941545330.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.963045148075446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19130089588267513, + "kl": 0.41748046875, + "learning_rate": 8.422801928884671e-08, + "loss": 0.0167, + "num_tokens": 2942107874.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9632158402321414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08155093769050874, + "kl": 0.4189453125, + "learning_rate": 8.345813167930927e-08, + "loss": 0.0167, + "num_tokens": 2942679026.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9633865323888368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15852137939724129, + "kl": 0.43212890625, + "learning_rate": 8.269176408416846e-08, + "loss": 0.0173, + "num_tokens": 2943246738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9635572245455322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24991355734729953, + "kl": 0.5048828125, + "learning_rate": 8.192891677545667e-08, + "loss": 0.0202, + "num_tokens": 2943811714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9637279167022276, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.208722521062343, + "kl": 0.54541015625, + "learning_rate": 8.11695900239573e-08, + "loss": 0.0218, + "num_tokens": 2944374994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.963898608858923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13360119992868386, + "kl": 0.42529296875, + "learning_rate": 8.041378409920476e-08, + "loss": 0.017, + "num_tokens": 2944943186.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9640693010156184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13158165453225285, + "kl": 0.44140625, + "learning_rate": 7.96614992694822e-08, + "loss": 0.0177, + "num_tokens": 2945524290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9642399931723137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22876631525181548, + "kl": 0.43896484375, + "learning_rate": 7.891273580182379e-08, + "loss": 0.0176, + "num_tokens": 2946087842.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9644106853290091, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1432942861978798, + "kl": 0.41845703125, + "learning_rate": 7.816749396201362e-08, + "loss": 0.0168, + "num_tokens": 2946659650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9645813774857045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13103710484225997, + "kl": 0.45947265625, + "learning_rate": 7.742577401458562e-08, + "loss": 0.0184, + "num_tokens": 2947224514.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9647520696423999, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06792746349704448, + "kl": 0.42236328125, + "learning_rate": 7.668757622282253e-08, + "loss": 0.0169, + "num_tokens": 2947795378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9649227617990953, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.518694460985342, + "kl": 0.482421875, + "learning_rate": 7.595290084876029e-08, + "loss": 0.0193, + "num_tokens": 2948358722.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9650934539557907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04498298784958209, + "kl": 0.43603515625, + "learning_rate": 7.522174815318028e-08, + "loss": 0.0174, + "num_tokens": 2948922258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9652641461124861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23893203969725405, + "kl": 0.42724609375, + "learning_rate": 7.449411839561494e-08, + "loss": 0.0171, + "num_tokens": 2949504146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9654348382691815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12366978504944591, + "kl": 0.4580078125, + "learning_rate": 7.377001183434761e-08, + "loss": 0.0183, + "num_tokens": 2950076530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9656055304258769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11084449260638815, + "kl": 0.39501953125, + "learning_rate": 7.304942872641052e-08, + "loss": 0.0158, + "num_tokens": 2950640802.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9657762225825723, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.556190682028261, + "kl": 0.43603515625, + "learning_rate": 7.233236932758347e-08, + "loss": 0.0174, + "num_tokens": 2951210882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9659469147392677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10809481129667109, + "kl": 0.41650390625, + "learning_rate": 7.161883389239732e-08, + "loss": 0.0167, + "num_tokens": 2951781122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9661176068959632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33722691742788824, + "kl": 0.43603515625, + "learning_rate": 7.090882267413058e-08, + "loss": 0.0174, + "num_tokens": 2952346306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9662882990526586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20617561432809273, + "kl": 0.39501953125, + "learning_rate": 7.020233592481273e-08, + "loss": 0.0158, + "num_tokens": 2952910882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.966458991209354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23483193590810936, + "kl": 0.419921875, + "learning_rate": 6.949937389521988e-08, + "loss": 0.0168, + "num_tokens": 2953476450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9666296833660494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16400879844435487, + "kl": 0.4072265625, + "learning_rate": 6.879993683488018e-08, + "loss": 0.0163, + "num_tokens": 2954044978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9668003755227448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12959675323362402, + "kl": 0.4326171875, + "learning_rate": 6.810402499206614e-08, + "loss": 0.0173, + "num_tokens": 2954608546.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9669710676794401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18223367396970946, + "kl": 0.43994140625, + "learning_rate": 6.74116386138024e-08, + "loss": 0.0176, + "num_tokens": 2955168146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9671417598361355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17046990516361887, + "kl": 0.41748046875, + "learning_rate": 6.672277794586124e-08, + "loss": 0.0167, + "num_tokens": 2955732258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9673124519928309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08805523633138096, + "kl": 0.380859375, + "learning_rate": 6.603744323276262e-08, + "loss": 0.0152, + "num_tokens": 2956298114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9674831441495263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1249840590031572, + "kl": 0.408203125, + "learning_rate": 6.53556347177764e-08, + "loss": 0.0163, + "num_tokens": 2956862338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9676538363062217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.261126119802252, + "kl": 0.42626953125, + "learning_rate": 6.467735264292008e-08, + "loss": 0.0171, + "num_tokens": 2957425746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9678245284629171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17151435819306637, + "kl": 0.42724609375, + "learning_rate": 6.400259724895996e-08, + "loss": 0.0171, + "num_tokens": 2957984402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9679952206196125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2553308592164162, + "kl": 0.470703125, + "learning_rate": 6.33313687754078e-08, + "loss": 0.0188, + "num_tokens": 2958553458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9681659127763079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09877169886693977, + "kl": 0.4072265625, + "learning_rate": 6.266366746052633e-08, + "loss": 0.0163, + "num_tokens": 2959118786.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9683366049330033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11473354585915639, + "kl": 0.4169921875, + "learning_rate": 6.199949354132595e-08, + "loss": 0.0167, + "num_tokens": 2959685090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9685072970896987, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.6265898006724527, + "kl": 0.41259765625, + "learning_rate": 6.133884725356476e-08, + "loss": 0.0165, + "num_tokens": 2960249666.0, + "reward": 0.0009765625, + "reward_std": 0.00390625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0009765625, + "rewards/tag_count_reward/std": 0.015625, + "step": 5674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9686779892463941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7499937761730598, + "kl": 0.470703125, + "learning_rate": 6.068172883174738e-08, + "loss": 0.0189, + "num_tokens": 2960815602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9688486814030896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07769578736558293, + "kl": 0.38037109375, + "learning_rate": 6.002813850912726e-08, + "loss": 0.0152, + "num_tokens": 2961386450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.969019373559785, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3644909409245454, + "kl": 0.451171875, + "learning_rate": 5.937807651770544e-08, + "loss": 0.0181, + "num_tokens": 2961949042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9691900657164804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05409715011977937, + "kl": 0.4384765625, + "learning_rate": 5.87315430882307e-08, + "loss": 0.0175, + "num_tokens": 2962516994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9693607578731758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08909720935273839, + "kl": 0.41455078125, + "learning_rate": 5.8088538450199464e-08, + "loss": 0.0166, + "num_tokens": 2963085506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9695314500298712, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11545564971033462, + "kl": 0.38134765625, + "learning_rate": 5.744906283185469e-08, + "loss": 0.0153, + "num_tokens": 2963653922.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9697021421865665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08690046548568613, + "kl": 0.38525390625, + "learning_rate": 5.6813116460185944e-08, + "loss": 0.0154, + "num_tokens": 2964223346.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9698728343432619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.127964276101344, + "kl": 0.43359375, + "learning_rate": 5.618069956093375e-08, + "loss": 0.0173, + "num_tokens": 2964789090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9700435264999573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24860820175244883, + "kl": 0.40234375, + "learning_rate": 5.555181235858187e-08, + "loss": 0.0161, + "num_tokens": 2965362978.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9702142186566527, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.541113901201632, + "kl": 0.48486328125, + "learning_rate": 5.492645507636174e-08, + "loss": 0.0194, + "num_tokens": 2965932018.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9703849108133481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08289131692702885, + "kl": 0.3935546875, + "learning_rate": 5.4304627936254685e-08, + "loss": 0.0157, + "num_tokens": 2966492306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9705556029700435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15607141764991597, + "kl": 0.41748046875, + "learning_rate": 5.368633115898414e-08, + "loss": 0.0167, + "num_tokens": 2967062338.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9707262951267389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12273797411451343, + "kl": 0.39013671875, + "learning_rate": 5.3071564964026766e-08, + "loss": 0.0156, + "num_tokens": 2967630082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9708969872834343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3230038043459323, + "kl": 0.40625, + "learning_rate": 5.246032956959912e-08, + "loss": 0.0162, + "num_tokens": 2968196674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9710676794401297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28767644550399823, + "kl": 0.4375, + "learning_rate": 5.185262519266876e-08, + "loss": 0.0175, + "num_tokens": 2968763538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9712383715968251, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03665098948143799, + "kl": 0.38037109375, + "learning_rate": 5.124845204894868e-08, + "loss": 0.0152, + "num_tokens": 2969332610.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9714090637535205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11706465736891618, + "kl": 0.435546875, + "learning_rate": 5.0647810352899565e-08, + "loss": 0.0174, + "num_tokens": 2969898578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.971579755910216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1593046648408358, + "kl": 0.40771484375, + "learning_rate": 5.005070031772752e-08, + "loss": 0.0163, + "num_tokens": 2970465090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9717504480669114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06663726140385284, + "kl": 0.3876953125, + "learning_rate": 4.945712215538301e-08, + "loss": 0.0155, + "num_tokens": 2971031746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9719211402236068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2033525202283118, + "kl": 0.42333984375, + "learning_rate": 4.886707607656638e-08, + "loss": 0.0169, + "num_tokens": 2971597570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9720918323803022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07518708824846491, + "kl": 0.4033203125, + "learning_rate": 4.828056229072231e-08, + "loss": 0.0161, + "num_tokens": 2972165250.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9722625245369976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9963677866854543, + "kl": 0.49609375, + "learning_rate": 4.769758100604316e-08, + "loss": 0.0198, + "num_tokens": 2972734130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9724332166936929, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17781961788670322, + "kl": 0.404296875, + "learning_rate": 4.7118132429464506e-08, + "loss": 0.0162, + "num_tokens": 2973300690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9726039088503883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16851910209551113, + "kl": 0.41552734375, + "learning_rate": 4.654221676666959e-08, + "loss": 0.0166, + "num_tokens": 2973876594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9727746010070837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06118536096865977, + "kl": 0.39892578125, + "learning_rate": 4.596983422209045e-08, + "loss": 0.016, + "num_tokens": 2974442674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9729452931637791, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.337282561854967, + "kl": 0.82861328125, + "learning_rate": 4.5400984998899e-08, + "loss": 0.0331, + "num_tokens": 2975008594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9731159853204745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21085356560289148, + "kl": 0.4033203125, + "learning_rate": 4.483566929901817e-08, + "loss": 0.0161, + "num_tokens": 2975576402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9732866774771699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11531966970652462, + "kl": 0.44189453125, + "learning_rate": 4.4273887323113e-08, + "loss": 0.0177, + "num_tokens": 2976163714.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9734573696338653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07198664430924608, + "kl": 0.423828125, + "learning_rate": 4.37156392705973e-08, + "loss": 0.017, + "num_tokens": 2976743106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9736280617905607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10115260329181, + "kl": 0.408203125, + "learning_rate": 4.3160925339629233e-08, + "loss": 0.0163, + "num_tokens": 2977306450.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9737987539472561, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1909545465546604, + "kl": 0.4404296875, + "learning_rate": 4.2609745727110185e-08, + "loss": 0.0176, + "num_tokens": 2977873314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9739694461039515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2304692121545416, + "kl": 0.47314453125, + "learning_rate": 4.2062100628691427e-08, + "loss": 0.0189, + "num_tokens": 2978442002.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9741401382606469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11919636735827852, + "kl": 0.43115234375, + "learning_rate": 4.151799023876524e-08, + "loss": 0.0173, + "num_tokens": 2979017074.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9743108304173423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3633361989368781, + "kl": 0.490234375, + "learning_rate": 4.0977414750471565e-08, + "loss": 0.0196, + "num_tokens": 2979578834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9744815225740378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3614289307408949, + "kl": 0.45263671875, + "learning_rate": 4.044037435569692e-08, + "loss": 0.0181, + "num_tokens": 2980144578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9746522147307332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15832469156156778, + "kl": 0.39404296875, + "learning_rate": 3.9906869245068816e-08, + "loss": 0.0158, + "num_tokens": 2980710050.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9748229068874286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11269701087102407, + "kl": 0.4443359375, + "learning_rate": 3.9376899607963536e-08, + "loss": 0.0178, + "num_tokens": 2981273650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.974993599044124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20423902221288476, + "kl": 0.3876953125, + "learning_rate": 3.885046563250061e-08, + "loss": 0.0155, + "num_tokens": 2981840130.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9751642912008194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25743431942848544, + "kl": 0.44775390625, + "learning_rate": 3.832756750554722e-08, + "loss": 0.0179, + "num_tokens": 2982414962.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9753349833575147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19987269364284602, + "kl": 0.42578125, + "learning_rate": 3.7808205412710463e-08, + "loss": 0.017, + "num_tokens": 2982983426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9755056755142101, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27284692212639283, + "kl": 0.4267578125, + "learning_rate": 3.729237953834619e-08, + "loss": 0.0171, + "num_tokens": 2983548578.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9756763676709055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11788207468078671, + "kl": 0.427734375, + "learning_rate": 3.678009006555461e-08, + "loss": 0.0171, + "num_tokens": 2984113586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9758470598276009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0919935644436689, + "kl": 0.39208984375, + "learning_rate": 3.6271337176179146e-08, + "loss": 0.0157, + "num_tokens": 2984684562.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9760177519842963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13531772171626757, + "kl": 0.4287109375, + "learning_rate": 3.5766121050808675e-08, + "loss": 0.0171, + "num_tokens": 2985251810.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9761884441409917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16378321723785166, + "kl": 0.38671875, + "learning_rate": 3.526444186877753e-08, + "loss": 0.0155, + "num_tokens": 2985818866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9763591362976871, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18641752422134852, + "kl": 0.443359375, + "learning_rate": 3.476629980816326e-08, + "loss": 0.0177, + "num_tokens": 2986380946.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9765298284543825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0653599328044199, + "kl": 0.43115234375, + "learning_rate": 3.427169504578776e-08, + "loss": 0.0173, + "num_tokens": 2986941442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9767005206110779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17754226506657167, + "kl": 0.5, + "learning_rate": 3.378062775721946e-08, + "loss": 0.02, + "num_tokens": 2987510482.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9768712127677733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16235849473280786, + "kl": 0.4833984375, + "learning_rate": 3.329309811676784e-08, + "loss": 0.0193, + "num_tokens": 2988077954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9770419049244687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08144911964150879, + "kl": 0.42822265625, + "learning_rate": 3.28091062974889e-08, + "loss": 0.0171, + "num_tokens": 2988645826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9772125970811641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2973488253130099, + "kl": 0.44287109375, + "learning_rate": 3.232865247118189e-08, + "loss": 0.0177, + "num_tokens": 2989209954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 2040.5859375, + "completions/mean_terminated_length": 150.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.9773832892378596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3171323161404236, + "kl": 0.4482421875, + "learning_rate": 3.1851736808391484e-08, + "loss": 0.0179, + "num_tokens": 2989775592.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.977553981394555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17270217975036695, + "kl": 0.38623046875, + "learning_rate": 3.137835947840451e-08, + "loss": 0.0155, + "num_tokens": 2990341464.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9777246735512504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1894934146403871, + "kl": 0.4208984375, + "learning_rate": 3.0908520649254316e-08, + "loss": 0.0168, + "num_tokens": 2990908040.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9778953657079458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1057752286133358, + "kl": 0.43603515625, + "learning_rate": 3.044222048771417e-08, + "loss": 0.0174, + "num_tokens": 2991467112.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9780660578646411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23070234131125011, + "kl": 0.48486328125, + "learning_rate": 2.997945915930722e-08, + "loss": 0.0194, + "num_tokens": 2992031832.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9782367500213365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15022845565586565, + "kl": 0.42138671875, + "learning_rate": 2.9520236828294303e-08, + "loss": 0.0169, + "num_tokens": 2992624216.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9784074421780319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15239434097886337, + "kl": 0.521484375, + "learning_rate": 2.9064553657683902e-08, + "loss": 0.0209, + "num_tokens": 2993197512.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9785781343347273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18041288912707798, + "kl": 0.42333984375, + "learning_rate": 2.8612409809226637e-08, + "loss": 0.0169, + "num_tokens": 2993763112.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9787488264914227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04999715704239304, + "kl": 0.40087890625, + "learning_rate": 2.8163805443417457e-08, + "loss": 0.016, + "num_tokens": 2994327128.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9789195186481181, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11196442554929892, + "kl": 0.4208984375, + "learning_rate": 2.7718740719494543e-08, + "loss": 0.0168, + "num_tokens": 2994907368.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9790902108048135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12472306397023179, + "kl": 0.4013671875, + "learning_rate": 2.7277215795440404e-08, + "loss": 0.0161, + "num_tokens": 2995468408.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9792609029615089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7020444066571352, + "kl": 0.576171875, + "learning_rate": 2.683923082797968e-08, + "loss": 0.0231, + "num_tokens": 2996029368.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9794315951182043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3116723700090347, + "kl": 0.439453125, + "learning_rate": 2.6404785972581337e-08, + "loss": 0.0176, + "num_tokens": 2996596104.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9796022872748997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13905564703258677, + "kl": 0.42822265625, + "learning_rate": 2.5973881383458687e-08, + "loss": 0.0171, + "num_tokens": 2997170632.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9797729794315951, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08354261448737604, + "kl": 0.4306640625, + "learning_rate": 2.5546517213566045e-08, + "loss": 0.0172, + "num_tokens": 2997737544.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9799436715882905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07451578223479287, + "kl": 0.41845703125, + "learning_rate": 2.5122693614602068e-08, + "loss": 0.0167, + "num_tokens": 2998303608.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.980114363744986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.458943095100323, + "kl": 0.50927734375, + "learning_rate": 2.4702410737010852e-08, + "loss": 0.0204, + "num_tokens": 2998871000.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9802850559016814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10724987542353909, + "kl": 0.4873046875, + "learning_rate": 2.4285668729975287e-08, + "loss": 0.0195, + "num_tokens": 2999438408.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9804557480583768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10111579543832667, + "kl": 0.4169921875, + "learning_rate": 2.387246774142482e-08, + "loss": 0.0167, + "num_tokens": 3000001784.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9806264402150722, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.188084576378783, + "kl": 0.50146484375, + "learning_rate": 2.3462807918031015e-08, + "loss": 0.02, + "num_tokens": 3000566888.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9807971323717675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11672245011800722, + "kl": 0.47998046875, + "learning_rate": 2.3056689405207555e-08, + "loss": 0.0192, + "num_tokens": 3001130792.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9809678245284629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13935198310184008, + "kl": 0.4228515625, + "learning_rate": 2.2654112347113567e-08, + "loss": 0.0169, + "num_tokens": 3001698120.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9811385166851583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1990925008019198, + "kl": 0.4453125, + "learning_rate": 2.2255076886646965e-08, + "loss": 0.0178, + "num_tokens": 3002259544.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9813092088418537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2780750935212599, + "kl": 0.431640625, + "learning_rate": 2.1859583165453336e-08, + "loss": 0.0173, + "num_tokens": 3002829688.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9814799009985491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0641322048884173, + "kl": 0.4775390625, + "learning_rate": 2.146763132391594e-08, + "loss": 0.0191, + "num_tokens": 3003401016.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9816505931552445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2209802828726106, + "kl": 0.4306640625, + "learning_rate": 2.1079221501166814e-08, + "loss": 0.0172, + "num_tokens": 3003968248.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9818212853119399, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23450813107906016, + "kl": 0.45703125, + "learning_rate": 2.0694353835074566e-08, + "loss": 0.0183, + "num_tokens": 3004529224.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9819919774686353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09451716800567161, + "kl": 0.44189453125, + "learning_rate": 2.0313028462254358e-08, + "loss": 0.0177, + "num_tokens": 3005097704.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9821626696253307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2635999711753064, + "kl": 0.43603515625, + "learning_rate": 1.9935245518063474e-08, + "loss": 0.0175, + "num_tokens": 3005659512.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9823333617820261, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03864288789449007, + "kl": 0.3916015625, + "learning_rate": 1.9561005136601308e-08, + "loss": 0.0157, + "num_tokens": 3006222952.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9825040539387215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5618917590943844, + "kl": 0.41455078125, + "learning_rate": 1.9190307450708266e-08, + "loss": 0.0166, + "num_tokens": 3006789976.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9826747460954169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12253667559940461, + "kl": 0.39306640625, + "learning_rate": 1.8823152591970205e-08, + "loss": 0.0157, + "num_tokens": 3007354920.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9828454382521123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2389611449463554, + "kl": 0.455078125, + "learning_rate": 1.8459540690712875e-08, + "loss": 0.0182, + "num_tokens": 3007921816.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9830161304088078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06856787754177815, + "kl": 0.40185546875, + "learning_rate": 1.809947187600525e-08, + "loss": 0.0161, + "num_tokens": 3008481416.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9831868225655032, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12864837403147725, + "kl": 0.50732421875, + "learning_rate": 1.7742946275659535e-08, + "loss": 0.0202, + "num_tokens": 3009044344.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9833575147221986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10812762306799728, + "kl": 0.41455078125, + "learning_rate": 1.7389964016228943e-08, + "loss": 0.0166, + "num_tokens": 3009610920.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9835282068788939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03441499601415952, + "kl": 0.4111328125, + "learning_rate": 1.704052522300992e-08, + "loss": 0.0164, + "num_tokens": 3010180696.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9836988990355893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6650507830173277, + "kl": 0.47021484375, + "learning_rate": 1.6694630020041014e-08, + "loss": 0.0188, + "num_tokens": 3010743736.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9838695911922847, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1898129447804965, + "kl": 0.38330078125, + "learning_rate": 1.6352278530100683e-08, + "loss": 0.0153, + "num_tokens": 3011308792.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9840402833489801, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2479764494998173, + "kl": 0.5263671875, + "learning_rate": 1.6013470874712833e-08, + "loss": 0.0211, + "num_tokens": 3011872216.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9842109755056755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.40172398873184123, + "kl": 0.50244140625, + "learning_rate": 1.567820717414126e-08, + "loss": 0.0201, + "num_tokens": 3012445368.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9843816676623709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32300318455613025, + "kl": 0.4423828125, + "learning_rate": 1.5346487547392984e-08, + "loss": 0.0177, + "num_tokens": 3013012408.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9845523598190663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14767167376596987, + "kl": 0.4052734375, + "learning_rate": 1.501831211221716e-08, + "loss": 0.0162, + "num_tokens": 3013585592.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9847230519757617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2014019261480226, + "kl": 0.4248046875, + "learning_rate": 1.4693680985102821e-08, + "loss": 0.017, + "num_tokens": 3014156216.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9848937441324571, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13318753328331556, + "kl": 0.44189453125, + "learning_rate": 1.4372594281282237e-08, + "loss": 0.0177, + "num_tokens": 3014722888.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9850644362891525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060812858101263675, + "kl": 0.42529296875, + "learning_rate": 1.4055052114730905e-08, + "loss": 0.017, + "num_tokens": 3015291656.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9852351284458479, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17543540962336934, + "kl": 0.443359375, + "learning_rate": 1.3741054598164216e-08, + "loss": 0.0178, + "num_tokens": 3015857608.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9854058206025433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13717030054606727, + "kl": 0.431640625, + "learning_rate": 1.3430601843039681e-08, + "loss": 0.0173, + "num_tokens": 3016424888.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9855765127592387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15744043344731032, + "kl": 0.431640625, + "learning_rate": 1.3123693959556927e-08, + "loss": 0.0173, + "num_tokens": 3016986904.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9857472049159341, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1529390512875084, + "kl": 0.40869140625, + "learning_rate": 1.2820331056657698e-08, + "loss": 0.0164, + "num_tokens": 3017554520.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9859178970726296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08700504615252891, + "kl": 0.45654296875, + "learning_rate": 1.2520513242023636e-08, + "loss": 0.0183, + "num_tokens": 3018118344.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.986088589229325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22703678975661207, + "kl": 0.412109375, + "learning_rate": 1.222424062208072e-08, + "loss": 0.0165, + "num_tokens": 3018687352.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9862592813860203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08677902284575552, + "kl": 0.42919921875, + "learning_rate": 1.1931513301994823e-08, + "loss": 0.0172, + "num_tokens": 3019255224.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9864299735427157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3503931489418858, + "kl": 0.52392578125, + "learning_rate": 1.1642331385672834e-08, + "loss": 0.021, + "num_tokens": 3019819032.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9866006656994111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2346195191547291, + "kl": 0.43408203125, + "learning_rate": 1.135669497576375e-08, + "loss": 0.0174, + "num_tokens": 3020385224.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9867713578561065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20016003770449045, + "kl": 0.4951171875, + "learning_rate": 1.1074604173658688e-08, + "loss": 0.0198, + "num_tokens": 3020953720.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9869420500128019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07128643975685525, + "kl": 0.36279296875, + "learning_rate": 1.0796059079489774e-08, + "loss": 0.0145, + "num_tokens": 3021517384.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9871127421694973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19472106342014173, + "kl": 0.4638671875, + "learning_rate": 1.0521059792130139e-08, + "loss": 0.0186, + "num_tokens": 3022083240.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9872834343261927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2193001634158105, + "kl": 0.4658203125, + "learning_rate": 1.0249606409195035e-08, + "loss": 0.0186, + "num_tokens": 3022650024.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9874541264828881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4655401773664164, + "kl": 0.4580078125, + "learning_rate": 9.981699027040714e-09, + "loss": 0.0183, + "num_tokens": 3023215800.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9876248186395835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11211473317388553, + "kl": 0.42138671875, + "learning_rate": 9.717337740763334e-09, + "loss": 0.0169, + "num_tokens": 3023787384.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9877955107962789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06951747754068893, + "kl": 0.38525390625, + "learning_rate": 9.45652264420338e-09, + "loss": 0.0154, + "num_tokens": 3024352648.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9879662029529743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38238315193856265, + "kl": 0.48046875, + "learning_rate": 9.199253829940136e-09, + "loss": 0.0192, + "num_tokens": 3024917368.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9881368951096697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17154855562303079, + "kl": 0.44091796875, + "learning_rate": 8.945531389293881e-09, + "loss": 0.0176, + "num_tokens": 3025482664.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9883075872663651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10271670400733962, + "kl": 0.44189453125, + "learning_rate": 8.695355412328133e-09, + "loss": 0.0177, + "num_tokens": 3026050648.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9884782794230605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1440524199615135, + "kl": 0.45263671875, + "learning_rate": 8.448725987846295e-09, + "loss": 0.0181, + "num_tokens": 3026612456.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.988648971579756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05965272596788457, + "kl": 0.419921875, + "learning_rate": 8.205643203391678e-09, + "loss": 0.0168, + "num_tokens": 3027174744.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9888196637364514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10760728101540688, + "kl": 0.48681640625, + "learning_rate": 7.966107145249702e-09, + "loss": 0.0194, + "num_tokens": 3027737832.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9889903558931467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11439855992638803, + "kl": 0.412109375, + "learning_rate": 7.73011789845013e-09, + "loss": 0.0165, + "num_tokens": 3028302344.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9891610480498421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0940892475375637, + "kl": 0.4033203125, + "learning_rate": 7.497675546757067e-09, + "loss": 0.0161, + "num_tokens": 3028874184.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9893317402065375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2538331636049161, + "kl": 0.45849609375, + "learning_rate": 7.268780172681178e-09, + "loss": 0.0183, + "num_tokens": 3029444744.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9895024323632329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2107495539926659, + "kl": 0.42578125, + "learning_rate": 7.043431857470806e-09, + "loss": 0.017, + "num_tokens": 3030003880.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9896731245199283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18073031507385934, + "kl": 0.39306640625, + "learning_rate": 6.821630681117519e-09, + "loss": 0.0157, + "num_tokens": 3030569560.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9898438166766237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12917989083371903, + "kl": 0.4453125, + "learning_rate": 6.603376722352783e-09, + "loss": 0.0179, + "num_tokens": 3031137192.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9900145088333191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09883440625332902, + "kl": 0.4384765625, + "learning_rate": 6.388670058647961e-09, + "loss": 0.0175, + "num_tokens": 3031697832.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9901852009900145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12339547693257982, + "kl": 0.4296875, + "learning_rate": 6.177510766216532e-09, + "loss": 0.0172, + "num_tokens": 3032264008.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9903558931467099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34317579250199776, + "kl": 0.455078125, + "learning_rate": 5.969898920011874e-09, + "loss": 0.0182, + "num_tokens": 3032832792.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9905265853034053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2192092230775142, + "kl": 0.4326171875, + "learning_rate": 5.76583459373059e-09, + "loss": 0.0173, + "num_tokens": 3033402504.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9906972774601007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11382243530445584, + "kl": 0.419921875, + "learning_rate": 5.565317859805852e-09, + "loss": 0.0168, + "num_tokens": 3033965048.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9908679696167961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24265177354033482, + "kl": 0.46875, + "learning_rate": 5.368348789415168e-09, + "loss": 0.0187, + "num_tokens": 3034543384.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9910386617734915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1865653920818331, + "kl": 0.45361328125, + "learning_rate": 5.174927452475942e-09, + "loss": 0.0181, + "num_tokens": 3035107544.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9912093539301869, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1942171199636518, + "kl": 0.46240234375, + "learning_rate": 4.9850539176443666e-09, + "loss": 0.0185, + "num_tokens": 3035678280.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9913800460868823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5712312327609492, + "kl": 0.47021484375, + "learning_rate": 4.798728252318752e-09, + "loss": 0.0188, + "num_tokens": 3036241032.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9915507382435778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17487596036624262, + "kl": 0.431640625, + "learning_rate": 4.615950522639523e-09, + "loss": 0.0173, + "num_tokens": 3036804744.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9917214304002732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.360630247466372, + "kl": 0.4169921875, + "learning_rate": 4.436720793484784e-09, + "loss": 0.0167, + "num_tokens": 3037364184.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9918921225569685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0922218099043857, + "kl": 0.43896484375, + "learning_rate": 4.261039128474753e-09, + "loss": 0.0176, + "num_tokens": 3037930552.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9920628147136639, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41607830611084046, + "kl": 0.43115234375, + "learning_rate": 4.08890558997177e-09, + "loss": 0.0172, + "num_tokens": 3038498104.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9922335068703593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26327262952396285, + "kl": 0.44921875, + "learning_rate": 3.9203202390747375e-09, + "loss": 0.018, + "num_tokens": 3039065000.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9924041990270547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2734600904243899, + "kl": 0.49072265625, + "learning_rate": 3.755283135625787e-09, + "loss": 0.0196, + "num_tokens": 3039630552.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9925748911837501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.167945570596957, + "kl": 0.44921875, + "learning_rate": 3.5937943382080565e-09, + "loss": 0.018, + "num_tokens": 3040196664.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9927455833404455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23095485228234555, + "kl": 0.42333984375, + "learning_rate": 3.4358539041434712e-09, + "loss": 0.0169, + "num_tokens": 3040758920.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9929162754971409, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17846857551656264, + "kl": 0.45263671875, + "learning_rate": 3.2814618894960736e-09, + "loss": 0.0181, + "num_tokens": 3041326616.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9930869676538363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15128263275385304, + "kl": 0.39697265625, + "learning_rate": 3.1306183490686926e-09, + "loss": 0.0159, + "num_tokens": 3041893640.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9932576598105317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1847372942358078, + "kl": 0.4326171875, + "learning_rate": 2.983323336405164e-09, + "loss": 0.0173, + "num_tokens": 3042466184.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9934283519672271, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1666516007462534, + "kl": 0.435546875, + "learning_rate": 2.839576903790331e-09, + "loss": 0.0174, + "num_tokens": 3043033800.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9935990441239225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12263387896730829, + "kl": 0.3984375, + "learning_rate": 2.6993791022489336e-09, + "loss": 0.0159, + "num_tokens": 3043605496.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9937697362806179, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05926947619861906, + "kl": 0.4228515625, + "learning_rate": 2.5627299815467188e-09, + "loss": 0.0169, + "num_tokens": 3044172152.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9939404284373133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04263048972867984, + "kl": 0.38232421875, + "learning_rate": 2.4296295901882204e-09, + "loss": 0.0153, + "num_tokens": 3044736968.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9941111205940087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0944435512144109, + "kl": 0.47412109375, + "learning_rate": 2.30007797542009e-09, + "loss": 0.0189, + "num_tokens": 3045301560.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9942818127507042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12061876654871692, + "kl": 0.419921875, + "learning_rate": 2.1740751832266538e-09, + "loss": 0.0168, + "num_tokens": 3045860760.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9944525049073996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1814959890069922, + "kl": 0.43212890625, + "learning_rate": 2.051621258337688e-09, + "loss": 0.0173, + "num_tokens": 3046426344.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9946231970640949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13871326524947583, + "kl": 0.3740234375, + "learning_rate": 1.932716244216204e-09, + "loss": 0.0149, + "num_tokens": 3046995096.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9947938892207903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1053742356164613, + "kl": 0.45361328125, + "learning_rate": 1.8173601830717701e-09, + "loss": 0.0182, + "num_tokens": 3047557944.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9949645813774857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19315012058224126, + "kl": 0.41845703125, + "learning_rate": 1.7055531158505223e-09, + "loss": 0.0167, + "num_tokens": 3048120552.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9951352735341811, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09193818612471168, + "kl": 0.43115234375, + "learning_rate": 1.597295082240713e-09, + "loss": 0.0173, + "num_tokens": 3048684152.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9953059656908765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4449319688663756, + "kl": 0.38818359375, + "learning_rate": 1.4925861206693814e-09, + "loss": 0.0155, + "num_tokens": 3049253992.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9954766578475719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12372382742119419, + "kl": 0.416015625, + "learning_rate": 1.3914262683045744e-09, + "loss": 0.0166, + "num_tokens": 3049823880.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9956473500042673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10910055648330101, + "kl": 0.39306640625, + "learning_rate": 1.2938155610542347e-09, + "loss": 0.0157, + "num_tokens": 3050392040.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9958180421609627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39519441978743497, + "kl": 0.44140625, + "learning_rate": 1.199754033567313e-09, + "loss": 0.0177, + "num_tokens": 3050954504.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9959887343176581, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.099843966081086, + "kl": 0.66650390625, + "learning_rate": 1.109241719230436e-09, + "loss": 0.0266, + "num_tokens": 3051517416.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9961594264743535, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4427600689408109, + "kl": 0.45947265625, + "learning_rate": 1.0222786501745685e-09, + "loss": 0.0184, + "num_tokens": 3052080280.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9963301186310489, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1620600630331803, + "kl": 0.388671875, + "learning_rate": 9.388648572672409e-10, + "loss": 0.0155, + "num_tokens": 3052645496.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9965008107877443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4132420134183512, + "kl": 0.41357421875, + "learning_rate": 8.590003701181016e-10, + "loss": 0.0165, + "num_tokens": 3053211144.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9966715029444397, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5258247767515887, + "kl": 0.5693359375, + "learning_rate": 7.826852170744748e-10, + "loss": 0.0228, + "num_tokens": 3053787592.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9968421951011351, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05998708555213525, + "kl": 0.38037109375, + "learning_rate": 7.099194252269126e-10, + "loss": 0.0152, + "num_tokens": 3054356008.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9970128872578305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25275897170016126, + "kl": 0.484375, + "learning_rate": 6.407030204047537e-10, + "loss": 0.0194, + "num_tokens": 3054920712.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.997183579414526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1487141309024087, + "kl": 0.3984375, + "learning_rate": 5.750360271761235e-10, + "loss": 0.0159, + "num_tokens": 3055485720.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9973542715712213, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6738943698476796, + "kl": 0.55810546875, + "learning_rate": 5.129184688523747e-10, + "loss": 0.0224, + "num_tokens": 3056056632.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9975249637279167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14429885285824423, + "kl": 0.44921875, + "learning_rate": 4.543503674803162e-10, + "loss": 0.018, + "num_tokens": 3056617752.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9976956558846121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14620188190652403, + "kl": 0.435546875, + "learning_rate": 3.993317438522049e-10, + "loss": 0.0174, + "num_tokens": 3057180200.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9978663480413075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041142925412550796, + "kl": 0.45458984375, + "learning_rate": 3.4786261749575336e-10, + "loss": 0.0182, + "num_tokens": 3057742184.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9980370401980029, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045362239947927305, + "kl": 0.39111328125, + "learning_rate": 2.99943006681902e-10, + "loss": 0.0157, + "num_tokens": 3058304040.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9982077323546983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19000972908293903, + "kl": 0.4345703125, + "learning_rate": 2.555729284192676e-10, + "loss": 0.0174, + "num_tokens": 3058867160.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9983784245113937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10201585470049229, + "kl": 0.38427734375, + "learning_rate": 2.147523984585842e-10, + "loss": 0.0154, + "num_tokens": 3059432248.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9985491166680891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20454195615300472, + "kl": 0.416015625, + "learning_rate": 1.7748143128937246e-10, + "loss": 0.0167, + "num_tokens": 3060001400.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9987198088247845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08476601716426757, + "kl": 0.42578125, + "learning_rate": 1.4376004014216017e-10, + "loss": 0.017, + "num_tokens": 3060566264.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9988905009814799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05594301120184792, + "kl": 0.42333984375, + "learning_rate": 1.1358823698404131e-10, + "loss": 0.0169, + "num_tokens": 3061130200.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9990611931381753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14343895945497648, + "kl": 0.40869140625, + "learning_rate": 8.696603252866808e-11, + "loss": 0.0164, + "num_tokens": 3061696328.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9992318852948707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0781725335277288, + "kl": 0.44091796875, + "learning_rate": 6.389343622403844e-11, + "loss": 0.0176, + "num_tokens": 3062258328.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9994025774515661, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16905108419891599, + "kl": 0.41943359375, + "learning_rate": 4.437045625915737e-11, + "loss": 0.0168, + "num_tokens": 3062819496.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9995732696082615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3066146041276291, + "kl": 0.4287109375, + "learning_rate": 2.839709956625747e-11, + "loss": 0.0171, + "num_tokens": 3063382760.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.999743961764957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06447351533378298, + "kl": 0.3828125, + "learning_rate": 1.5973371813027273e-11, + "loss": 0.0153, + "num_tokens": 3063952840.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9999146539216524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2984308644724824, + "kl": 0.43359375, + "learning_rate": 7.099277411493077e-12, + "loss": 0.0173, + "num_tokens": 3064519096.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 5858 + }, + { + "epoch": 0.9999146539216524, + "step": 5858, + "total_flos": 0.0, + "train_loss": 0.0003539126314866376, + "train_runtime": 8111.4207, + "train_samples_per_second": 11.556, + "train_steps_per_second": 0.722 + } + ], + "logging_steps": 1, + "max_steps": 5859, + "num_input_tokens_seen": 3064519096, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}