diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14351 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9986564064981067, + "eval_steps": 500, + "global_step": 511, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6272.0, + "completions/max_terminated_length": 6272.0, + "completions/mean_length": 610.736328125, + "completions/mean_terminated_length": 611.9315185546875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0019543178209356295, + "grad_norm": 2.750488585160197, + "learning_rate": 0.0, + "loss": -0.0362, + "num_tokens": 372153.0, + "reward": 0.099609375, + "reward_std": 0.2192307561635971, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.02734375, + "rewards/soft_format_reward/std": 0.16324250400066376, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5907.0, + "completions/max_terminated_length": 5907.0, + "completions/mean_length": 603.810546875, + "completions/mean_terminated_length": 603.810546875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.003908635641871259, + "grad_norm": 0.8027496316458539, + "learning_rate": 1.923076923076923e-08, + "loss": -0.0467, + "num_tokens": 743064.0, + "reward": 0.1171875, + "reward_std": 0.23476263880729675, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.01953125, + "rewards/soft_format_reward/std": 0.1385180652141571, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4851.0, + "completions/max_terminated_length": 4851.0, + "completions/mean_length": 608.98046875, + "completions/mean_terminated_length": 608.98046875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.005862953462806889, + "grad_norm": 0.7149794525332026, + "learning_rate": 3.846153846153846e-08, + "loss": -0.0427, + "num_tokens": 1113998.0, + "reward": 0.1484375, + "reward_std": 0.2804192304611206, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.01171875, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4879.0, + "completions/max_terminated_length": 4879.0, + "completions/mean_length": 564.45703125, + "completions/mean_terminated_length": 565.5616455078125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.007817271283742518, + "grad_norm": 0.6974425166032573, + "learning_rate": 5.7692307692307695e-08, + "loss": -0.0598, + "num_tokens": 1458728.0, + "reward": 0.123046875, + "reward_std": 0.26185888051986694, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.0234375, + "rewards/soft_format_reward/std": 0.15143637359142303, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7374.0, + "completions/max_terminated_length": 7374.0, + "completions/mean_length": 569.392578125, + "completions/mean_terminated_length": 569.392578125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.009771589104678149, + "grad_norm": 2.8601410971054486, + "learning_rate": 7.692307692307692e-08, + "loss": -0.0948, + "num_tokens": 1810753.0, + "reward": 0.072265625, + "reward_std": 0.17700009047985077, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.01953125, + "rewards/soft_format_reward/std": 0.1385180652141571, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8142.0, + "completions/max_terminated_length": 8142.0, + "completions/mean_length": 597.33203125, + "completions/mean_terminated_length": 597.33203125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.011725906925613778, + "grad_norm": 0.6274617565453047, + "learning_rate": 9.615384615384616e-08, + "loss": -0.0875, + "num_tokens": 2182875.0, + "reward": 0.1015625, + "reward_std": 0.21828904747962952, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.0078125, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4679.0, + "completions/max_terminated_length": 4679.0, + "completions/mean_length": 517.54296875, + "completions/mean_terminated_length": 517.54296875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.013680224746549407, + "grad_norm": 0.5770217307222774, + "learning_rate": 1.1538461538461539e-07, + "loss": -0.0537, + "num_tokens": 2513137.0, + "reward": 0.134765625, + "reward_std": 0.2323426902294159, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.0234375, + "rewards/soft_format_reward/std": 0.15143637359142303, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6290.0, + "completions/max_terminated_length": 6290.0, + "completions/mean_length": 571.541015625, + "completions/mean_terminated_length": 572.6594848632812, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.015634542567485036, + "grad_norm": 0.6641257092021481, + "learning_rate": 1.346153846153846e-07, + "loss": -0.0872, + "num_tokens": 2857622.0, + "reward": 0.1298828125, + "reward_std": 0.2664027810096741, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.017578125, + "rewards/soft_format_reward/std": 0.13154059648513794, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5581.0, + "completions/max_terminated_length": 5581.0, + "completions/mean_length": 608.23046875, + "completions/mean_terminated_length": 609.4207153320312, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.01758886038842067, + "grad_norm": 0.592238878597358, + "learning_rate": 1.5384615384615385e-07, + "loss": -0.0436, + "num_tokens": 3239580.0, + "reward": 0.1357421875, + "reward_std": 0.2543027400970459, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.033203125, + "rewards/soft_format_reward/std": 0.17934183776378632, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4595.0, + "completions/max_terminated_length": 4595.0, + "completions/mean_length": 603.40625, + "completions/mean_terminated_length": 603.40625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.019543178209356298, + "grad_norm": 0.5915306711288894, + "learning_rate": 1.7307692307692305e-07, + "loss": -0.084, + "num_tokens": 3613004.0, + "reward": 0.119140625, + "reward_std": 0.2114141583442688, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.01953125, + "rewards/soft_format_reward/std": 0.1385180652141571, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6720.0, + "completions/max_terminated_length": 6720.0, + "completions/mean_length": 594.95703125, + "completions/mean_terminated_length": 596.121337890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.021497496030291927, + "grad_norm": 0.7073230239287356, + "learning_rate": 1.9230769230769231e-07, + "loss": -0.0558, + "num_tokens": 3975382.0, + "reward": 0.1318359375, + "reward_std": 0.22544629871845245, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.029296875, + "rewards/soft_format_reward/std": 0.16880230605602264, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4646.0, + "completions/max_terminated_length": 4646.0, + "completions/mean_length": 561.0, + "completions/mean_terminated_length": 561.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.023451813851227556, + "grad_norm": 0.7142101864966569, + "learning_rate": 2.1153846153846152e-07, + "loss": -0.064, + "num_tokens": 4320118.0, + "reward": 0.1123046875, + "reward_std": 0.2140614241361618, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.021484375, + "rewards/soft_format_reward/std": 0.14513419568538666, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7207.0, + "completions/max_terminated_length": 7207.0, + "completions/mean_length": 642.076171875, + "completions/mean_terminated_length": 642.076171875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.025406131672163185, + "grad_norm": 0.6472985857105006, + "learning_rate": 2.3076923076923078e-07, + "loss": -0.0488, + "num_tokens": 4706845.0, + "reward": 0.1201171875, + "reward_std": 0.23529882729053497, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.029296875, + "rewards/soft_format_reward/std": 0.16880230605602264, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6496.0, + "completions/max_terminated_length": 6496.0, + "completions/mean_length": 638.82421875, + "completions/mean_terminated_length": 638.82421875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.027360449493098814, + "grad_norm": 0.5085482848573109, + "learning_rate": 2.5e-07, + "loss": -0.0224, + "num_tokens": 5101203.0, + "reward": 0.1044921875, + "reward_std": 0.18590326607227325, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.025390625, + "rewards/soft_format_reward/std": 0.15746226906776428, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4915.0, + "completions/max_terminated_length": 4915.0, + "completions/mean_length": 604.025390625, + "completions/mean_terminated_length": 604.025390625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.029314767314034446, + "grad_norm": 0.5698167480527412, + "learning_rate": 2.692307692307692e-07, + "loss": -0.047, + "num_tokens": 5473984.0, + "reward": 0.080078125, + "reward_std": 0.1659148782491684, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.015625, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3868.0, + "completions/max_terminated_length": 3868.0, + "completions/mean_length": 584.705078125, + "completions/mean_terminated_length": 584.705078125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.03126908513497007, + "grad_norm": 0.5677167605898067, + "learning_rate": 2.884615384615384e-07, + "loss": -0.0625, + "num_tokens": 5833241.0, + "reward": 0.1337890625, + "reward_std": 0.2450886368751526, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.013671875, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4481.0, + "completions/max_terminated_length": 4481.0, + "completions/mean_length": 643.1484375, + "completions/mean_terminated_length": 644.4070434570312, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0332234029559057, + "grad_norm": 1.2139787553678811, + "learning_rate": 3.076923076923077e-07, + "loss": -0.034, + "num_tokens": 6225557.0, + "reward": 0.1240234375, + "reward_std": 0.2256689965724945, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.029296875, + "rewards/soft_format_reward/std": 0.16880230605602264, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6754.0, + "completions/max_terminated_length": 6754.0, + "completions/mean_length": 709.3984375, + "completions/mean_terminated_length": 709.3984375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.03517772077684134, + "grad_norm": 18.267841869620774, + "learning_rate": 3.269230769230769e-07, + "loss": -0.0368, + "num_tokens": 6650273.0, + "reward": 0.1630859375, + "reward_std": 0.26679402589797974, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.052734375, + "rewards/soft_format_reward/std": 0.22372129559516907, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 3826.0, + "completions/mean_length": 629.759765625, + "completions/mean_terminated_length": 629.759765625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.037132038597776966, + "grad_norm": 0.8902475142083207, + "learning_rate": 3.461538461538461e-07, + "loss": 0.0081, + "num_tokens": 7039542.0, + "reward": 0.0986328125, + "reward_std": 0.202886700630188, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.28054583072662354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.025390625, + "rewards/soft_format_reward/std": 0.15746226906776428, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7055.0, + "completions/max_terminated_length": 7055.0, + "completions/mean_length": 630.90625, + "completions/mean_terminated_length": 632.140869140625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.039086356418712595, + "grad_norm": 2.274014498101533, + "learning_rate": 3.6538461538461534e-07, + "loss": -0.0246, + "num_tokens": 7422502.0, + "reward": 0.1328125, + "reward_std": 0.25953900814056396, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.03125, + "rewards/soft_format_reward/std": 0.17416280508041382, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5356.0, + "completions/max_terminated_length": 5356.0, + "completions/mean_length": 624.4765625, + "completions/mean_terminated_length": 624.4765625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.041040674239648224, + "grad_norm": 1.5399912436450514, + "learning_rate": 3.8461538461538463e-07, + "loss": -0.0689, + "num_tokens": 7803706.0, + "reward": 0.1337890625, + "reward_std": 0.2392386794090271, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.041015625, + "rewards/soft_format_reward/std": 0.19852031767368317, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5844.0, + "completions/max_terminated_length": 5844.0, + "completions/mean_length": 678.767578125, + "completions/mean_terminated_length": 678.767578125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.04299499206058385, + "grad_norm": 1.8287057213470026, + "learning_rate": 4.0384615384615386e-07, + "loss": -0.0303, + "num_tokens": 8199299.0, + "reward": 0.185546875, + "reward_std": 0.33074378967285156, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.03515625, + "rewards/soft_format_reward/std": 0.1843547374010086, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5569.0, + "completions/max_terminated_length": 5569.0, + "completions/mean_length": 691.244140625, + "completions/mean_terminated_length": 693.9549560546875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.04494930988151948, + "grad_norm": 2.9910781885362914, + "learning_rate": 4.2307692307692304e-07, + "loss": -0.0044, + "num_tokens": 8607776.0, + "reward": 0.1552734375, + "reward_std": 0.2452322244644165, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.041015625, + "rewards/soft_format_reward/std": 0.19852031767368317, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4047.0, + "completions/max_terminated_length": 4047.0, + "completions/mean_length": 731.48828125, + "completions/mean_terminated_length": 732.9197387695312, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.04690362770245511, + "grad_norm": 1.0977506052692678, + "learning_rate": 4.423076923076923e-07, + "loss": 0.0162, + "num_tokens": 9040074.0, + "reward": 0.123046875, + "reward_std": 0.23959577083587646, + "rewards/accuracy_reward/mean": 0.107421875, + "rewards/accuracy_reward/std": 0.30995169281959534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.03125, + "rewards/soft_format_reward/std": 0.17416280508041382, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6061.0, + "completions/max_terminated_length": 6061.0, + "completions/mean_length": 685.828125, + "completions/mean_terminated_length": 685.828125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.04885794552339074, + "grad_norm": 2.9223355505111437, + "learning_rate": 4.6153846153846156e-07, + "loss": -0.0147, + "num_tokens": 9452242.0, + "reward": 0.1552734375, + "reward_std": 0.2749112844467163, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.044921875, + "rewards/soft_format_reward/std": 0.20733514428138733, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7938.0, + "completions/max_terminated_length": 7938.0, + "completions/mean_length": 833.51171875, + "completions/mean_terminated_length": 833.51171875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05081226334432637, + "grad_norm": 0.48396637853658414, + "learning_rate": 4.807692307692307e-07, + "loss": 0.0375, + "num_tokens": 9936648.0, + "reward": 0.1767578125, + "reward_std": 0.29792705178260803, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.056640625, + "rewards/soft_format_reward/std": 0.23138070106506348, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5735.0, + "completions/max_terminated_length": 5735.0, + "completions/mean_length": 804.255859375, + "completions/mean_terminated_length": 805.8297119140625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.052766581165262, + "grad_norm": 2.5518985574643884, + "learning_rate": 5e-07, + "loss": 0.053, + "num_tokens": 10405147.0, + "reward": 0.193359375, + "reward_std": 0.2737465500831604, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.06640625, + "rewards/soft_format_reward/std": 0.2492343932390213, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6042.0, + "completions/max_terminated_length": 6042.0, + "completions/mean_length": 802.859375, + "completions/mean_terminated_length": 804.4305419921875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05472089898619763, + "grad_norm": 5.369620828464795, + "learning_rate": 5.192307692307692e-07, + "loss": 0.0288, + "num_tokens": 10873123.0, + "reward": 0.158203125, + "reward_std": 0.25793004035949707, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.06640625, + "rewards/soft_format_reward/std": 0.2492343932390213, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6438.0, + "completions/max_terminated_length": 6438.0, + "completions/mean_length": 728.625, + "completions/mean_terminated_length": 730.0509033203125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.056675216807133263, + "grad_norm": 8.780096510908749, + "learning_rate": 5.384615384615384e-07, + "loss": 0.0097, + "num_tokens": 11307587.0, + "reward": 0.1484375, + "reward_std": 0.2401176393032074, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.06640625, + "rewards/soft_format_reward/std": 0.2492343932390213, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4252.0, + "completions/max_terminated_length": 4252.0, + "completions/mean_length": 671.765625, + "completions/mean_terminated_length": 671.765625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.05862953462806889, + "grad_norm": 10.719423466037227, + "learning_rate": 5.576923076923077e-07, + "loss": 0.0005, + "num_tokens": 11717339.0, + "reward": 0.208984375, + "reward_std": 0.3264630138874054, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.12109375, + "rewards/soft_format_reward/std": 0.3265552520751953, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5691.0, + "completions/max_terminated_length": 5691.0, + "completions/mean_length": 621.12109375, + "completions/mean_terminated_length": 621.12109375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06058385244900452, + "grad_norm": 4.768306794519214, + "learning_rate": 5.769230769230768e-07, + "loss": -0.0039, + "num_tokens": 12094617.0, + "reward": 0.27734375, + "reward_std": 0.38995561003685, + "rewards/accuracy_reward/mean": 0.228515625, + "rewards/accuracy_reward/std": 0.4202871024608612, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.09765625, + "rewards/soft_format_reward/std": 0.29713961482048035, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6333.0, + "completions/max_terminated_length": 6333.0, + "completions/mean_length": 727.05078125, + "completions/mean_terminated_length": 728.4735717773438, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06253817026994014, + "grad_norm": 1.6979812063171262, + "learning_rate": 5.961538461538461e-07, + "loss": 0.0324, + "num_tokens": 12530307.0, + "reward": 0.1943359375, + "reward_std": 0.3103134036064148, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.095703125, + "rewards/soft_format_reward/std": 0.2944713830947876, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8070.0, + "completions/max_terminated_length": 8070.0, + "completions/mean_length": 814.3515625, + "completions/mean_terminated_length": 814.3515625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06449248809087578, + "grad_norm": 2.1367180290106966, + "learning_rate": 6.153846153846154e-07, + "loss": 0.048, + "num_tokens": 13004151.0, + "reward": 0.201171875, + "reward_std": 0.3196222186088562, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.15625, + "rewards/soft_format_reward/std": 0.36344730854034424, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6720.0, + "completions/max_terminated_length": 6720.0, + "completions/mean_length": 703.599609375, + "completions/mean_terminated_length": 703.599609375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.0664468059118114, + "grad_norm": 2.616821676633914, + "learning_rate": 6.346153846153845e-07, + "loss": 0.03, + "num_tokens": 13423754.0, + "reward": 0.2626953125, + "reward_std": 0.370841920375824, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.162109375, + "rewards/soft_format_reward/std": 0.3689115643501282, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4734.0, + "completions/max_terminated_length": 4734.0, + "completions/mean_length": 768.07421875, + "completions/mean_terminated_length": 769.5772705078125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.06840112373274704, + "grad_norm": 2.774593070611475, + "learning_rate": 6.538461538461538e-07, + "loss": -0.0187, + "num_tokens": 13874656.0, + "reward": 0.228515625, + "reward_std": 0.32226231694221497, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.1640625, + "rewards/soft_format_reward/std": 0.37069445848464966, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7272.0, + "completions/max_terminated_length": 7272.0, + "completions/mean_length": 745.40234375, + "completions/mean_terminated_length": 746.8610229492188, + "completions/min_length": 0.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.07035544155368267, + "grad_norm": 57.53131936684584, + "learning_rate": 6.730769230769231e-07, + "loss": 0.052, + "num_tokens": 14323534.0, + "reward": 0.224609375, + "reward_std": 0.32410329580307007, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.1484375, + "rewards/soft_format_reward/std": 0.35588082671165466, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5527.0, + "completions/max_terminated_length": 5527.0, + "completions/mean_length": 717.23046875, + "completions/mean_terminated_length": 718.634033203125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.0723097593746183, + "grad_norm": 1.684332372874716, + "learning_rate": 6.923076923076922e-07, + "loss": 0.0461, + "num_tokens": 14749252.0, + "reward": 0.275390625, + "reward_std": 0.35603415966033936, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.203125, + "rewards/soft_format_reward/std": 0.4027182459831238, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5574.0, + "completions/max_terminated_length": 5574.0, + "completions/mean_length": 750.1015625, + "completions/mean_terminated_length": 753.0431518554688, + "completions/min_length": 0.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.07426407719555393, + "grad_norm": 15.86056017140293, + "learning_rate": 7.115384615384616e-07, + "loss": 0.0358, + "num_tokens": 15198504.0, + "reward": 0.2744140625, + "reward_std": 0.34245699644088745, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.232421875, + "rewards/soft_format_reward/std": 0.42278963327407837, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4598.0, + "completions/max_terminated_length": 4598.0, + "completions/mean_length": 625.771484375, + "completions/mean_terminated_length": 625.771484375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.07621839501648955, + "grad_norm": 1.4541591956995843, + "learning_rate": 7.307692307692307e-07, + "loss": 0.0404, + "num_tokens": 15574787.0, + "reward": 0.361328125, + "reward_std": 0.3739612102508545, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41380295157432556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.28515625, + "rewards/soft_format_reward/std": 0.45193037390708923, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4275.0, + "completions/max_terminated_length": 4275.0, + "completions/mean_length": 623.33984375, + "completions/mean_terminated_length": 623.33984375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.07817271283742519, + "grad_norm": 1.8501398156592497, + "learning_rate": 7.5e-07, + "loss": 0.0131, + "num_tokens": 15952529.0, + "reward": 0.37890625, + "reward_std": 0.4241042137145996, + "rewards/accuracy_reward/mean": 0.23046875, + "rewards/accuracy_reward/std": 0.42154473066329956, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.296875, + "rewards/soft_format_reward/std": 0.45732781291007996, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3620.0, + "completions/max_terminated_length": 3620.0, + "completions/mean_length": 729.07421875, + "completions/mean_terminated_length": 730.5009765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.08012703065836081, + "grad_norm": 1.8862161312826293, + "learning_rate": 7.692307692307693e-07, + "loss": 0.0119, + "num_tokens": 16390903.0, + "reward": 0.3154296875, + "reward_std": 0.3848440647125244, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.400390625, + "rewards/soft_format_reward/std": 0.4904567301273346, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6653.0, + "completions/max_terminated_length": 6653.0, + "completions/mean_length": 727.8984375, + "completions/mean_terminated_length": 729.3228759765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.08208134847929645, + "grad_norm": 6.451471591382041, + "learning_rate": 7.884615384615384e-07, + "loss": 0.0262, + "num_tokens": 16824963.0, + "reward": 0.3681640625, + "reward_std": 0.35936784744262695, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.458984375, + "rewards/soft_format_reward/std": 0.49880221486091614, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6943.0, + "completions/max_terminated_length": 6943.0, + "completions/mean_length": 750.8828125, + "completions/mean_terminated_length": 750.8828125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.08403566630023207, + "grad_norm": 15.63795334916065, + "learning_rate": 8.076923076923077e-07, + "loss": 0.0286, + "num_tokens": 17264359.0, + "reward": 0.431640625, + "reward_std": 0.4293968677520752, + "rewards/accuracy_reward/mean": 0.185546875, + "rewards/accuracy_reward/std": 0.38912075757980347, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.4921875, + "rewards/soft_format_reward/std": 0.5004279017448425, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4261.0, + "completions/max_terminated_length": 4261.0, + "completions/mean_length": 707.099609375, + "completions/mean_terminated_length": 707.099609375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.0859899841211677, + "grad_norm": 7.737867057880805, + "learning_rate": 8.269230769230768e-07, + "loss": 0.0307, + "num_tokens": 17690314.0, + "reward": 0.3916015625, + "reward_std": 0.34032970666885376, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.587890625, + "rewards/soft_format_reward/std": 0.49269601702690125, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7145.0, + "completions/max_terminated_length": 7145.0, + "completions/mean_length": 742.375, + "completions/mean_terminated_length": 743.8277587890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.08794430194210333, + "grad_norm": 0.6550011367505887, + "learning_rate": 8.461538461538461e-07, + "loss": 0.0697, + "num_tokens": 18135850.0, + "reward": 0.35546875, + "reward_std": 0.3161723017692566, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.5703125, + "rewards/soft_format_reward/std": 0.4955156147480011, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6386.0, + "completions/max_terminated_length": 6386.0, + "completions/mean_length": 696.259765625, + "completions/mean_terminated_length": 696.259765625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.08989861976303896, + "grad_norm": 11.795890440025957, + "learning_rate": 8.653846153846154e-07, + "loss": 0.0512, + "num_tokens": 18558175.0, + "reward": 0.4853515625, + "reward_std": 0.3495202660560608, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.701171875, + "rewards/soft_format_reward/std": 0.45819199085235596, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8076.0, + "completions/max_terminated_length": 8076.0, + "completions/mean_length": 641.501953125, + "completions/mean_terminated_length": 641.501953125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.0918529375839746, + "grad_norm": 6.744629282719349, + "learning_rate": 8.846153846153846e-07, + "loss": 0.03, + "num_tokens": 18943648.0, + "reward": 0.5244140625, + "reward_std": 0.37703123688697815, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.720703125, + "rewards/soft_format_reward/std": 0.44909247756004333, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7641.0, + "completions/max_terminated_length": 7641.0, + "completions/mean_length": 692.2890625, + "completions/mean_terminated_length": 692.2890625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.09380725540491022, + "grad_norm": 0.49832516126669424, + "learning_rate": 9.038461538461538e-07, + "loss": 0.0323, + "num_tokens": 19358388.0, + "reward": 0.5791015625, + "reward_std": 0.3485424518585205, + "rewards/accuracy_reward/mean": 0.17578125, + "rewards/accuracy_reward/std": 0.3810062110424042, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.806640625, + "rewards/soft_format_reward/std": 0.39531853795051575, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7011.0, + "completions/max_terminated_length": 7011.0, + "completions/mean_length": 680.265625, + "completions/mean_terminated_length": 680.265625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.09576157322584586, + "grad_norm": 0.5276664042949974, + "learning_rate": 9.230769230769231e-07, + "loss": 0.0306, + "num_tokens": 19762188.0, + "reward": 0.5830078125, + "reward_std": 0.33912643790245056, + "rewards/accuracy_reward/mean": 0.16015625, + "rewards/accuracy_reward/std": 0.3671095669269562, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.845703125, + "rewards/soft_format_reward/std": 0.36158639192581177, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5338.0, + "completions/max_terminated_length": 5338.0, + "completions/mean_length": 701.224609375, + "completions/mean_terminated_length": 701.224609375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.09771589104678148, + "grad_norm": 0.398530583083826, + "learning_rate": 9.423076923076923e-07, + "loss": 0.0292, + "num_tokens": 20204255.0, + "reward": 0.5439453125, + "reward_std": 0.2799522578716278, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.869140625, + "rewards/soft_format_reward/std": 0.33757632970809937, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6869.0, + "completions/max_terminated_length": 6869.0, + "completions/mean_length": 630.107421875, + "completions/mean_terminated_length": 630.107421875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.09967020886771712, + "grad_norm": 0.756981742184372, + "learning_rate": 9.615384615384615e-07, + "loss": 0.0053, + "num_tokens": 20609574.0, + "reward": 0.666015625, + "reward_std": 0.3723810315132141, + "rewards/accuracy_reward/mean": 0.212890625, + "rewards/accuracy_reward/std": 0.409751296043396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.90625, + "rewards/soft_format_reward/std": 0.29176566004753113, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7702.0, + "completions/max_terminated_length": 7702.0, + "completions/mean_length": 634.169921875, + "completions/mean_terminated_length": 634.169921875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.10162452668865274, + "grad_norm": 0.5098035893058888, + "learning_rate": 9.807692307692306e-07, + "loss": 0.0414, + "num_tokens": 21029805.0, + "reward": 0.6962890625, + "reward_std": 0.37402695417404175, + "rewards/accuracy_reward/mean": 0.236328125, + "rewards/accuracy_reward/std": 0.42524150013923645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.919921875, + "rewards/soft_format_reward/std": 0.271679550409317, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3695.0, + "completions/max_terminated_length": 3695.0, + "completions/mean_length": 615.41796875, + "completions/mean_terminated_length": 615.41796875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.10357884450958837, + "grad_norm": 1.4436174280591474, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 21417219.0, + "reward": 0.6953125, + "reward_std": 0.37299060821533203, + "rewards/accuracy_reward/mean": 0.240234375, + "rewards/accuracy_reward/std": 0.4276435375213623, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.91015625, + "rewards/soft_format_reward/std": 0.2862374484539032, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6882.0, + "completions/max_terminated_length": 6882.0, + "completions/mean_length": 702.88671875, + "completions/mean_terminated_length": 702.88671875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.105533162330524, + "grad_norm": 10.904435680320782, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 21844153.0, + "reward": 0.728515625, + "reward_std": 0.38241422176361084, + "rewards/accuracy_reward/mean": 0.263671875, + "rewards/accuracy_reward/std": 0.4410543739795685, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9296875, + "rewards/soft_format_reward/std": 0.25592297315597534, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3676.0, + "completions/max_terminated_length": 3676.0, + "completions/mean_length": 719.375, + "completions/mean_terminated_length": 719.375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.10748748015145963, + "grad_norm": 3.737914571876055, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 22280169.0, + "reward": 0.7099609375, + "reward_std": 0.3053381145000458, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4190165400505066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.966796875, + "rewards/soft_format_reward/std": 0.17934183776378632, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2744.0, + "completions/max_terminated_length": 2744.0, + "completions/mean_length": 667.369140625, + "completions/mean_terminated_length": 667.369140625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.10944179797239525, + "grad_norm": 1.923875905136729, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 22696006.0, + "reward": 0.6923828125, + "reward_std": 0.2837657928466797, + "rewards/accuracy_reward/mean": 0.212890625, + "rewards/accuracy_reward/std": 0.409751296043396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.958984375, + "rewards/soft_format_reward/std": 0.19852031767368317, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4236.0, + "completions/max_terminated_length": 4236.0, + "completions/mean_length": 715.671875, + "completions/mean_terminated_length": 715.671875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.11139611579333089, + "grad_norm": 0.4626364000567972, + "learning_rate": 1e-06, + "loss": 0.0287, + "num_tokens": 23131950.0, + "reward": 0.66796875, + "reward_std": 0.31913653016090393, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.4027182459831238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9296875, + "rewards/soft_format_reward/std": 0.25592297315597534, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6450.0, + "completions/max_terminated_length": 6450.0, + "completions/mean_length": 761.703125, + "completions/mean_terminated_length": 761.703125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.11335043361426653, + "grad_norm": 0.5050300600449099, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 23602854.0, + "reward": 0.6103515625, + "reward_std": 0.24192854762077332, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.970703125, + "rewards/soft_format_reward/std": 0.16880230605602264, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4924.0, + "completions/max_terminated_length": 4924.0, + "completions/mean_length": 669.611328125, + "completions/mean_terminated_length": 669.611328125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.11530475143520215, + "grad_norm": 2.478364651754757, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 24017823.0, + "reward": 0.7666015625, + "reward_std": 0.3241463899612427, + "rewards/accuracy_reward/mean": 0.279296875, + "rewards/accuracy_reward/std": 0.44909247756004333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.974609375, + "rewards/soft_format_reward/std": 0.15746226906776428, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2975.0, + "completions/max_terminated_length": 2975.0, + "completions/mean_length": 575.0, + "completions/mean_terminated_length": 575.0, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.11725906925613779, + "grad_norm": 1.2667258807616255, + "learning_rate": 1e-06, + "loss": -0.019, + "num_tokens": 24393055.0, + "reward": 0.9130859375, + "reward_std": 0.38359910249710083, + "rewards/accuracy_reward/mean": 0.41796875, + "rewards/accuracy_reward/std": 0.4937073290348053, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2032.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 646.26953125, + "completions/mean_terminated_length": 646.26953125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.11921338707707341, + "grad_norm": 0.9763668567929443, + "learning_rate": 1e-06, + "loss": -0.0147, + "num_tokens": 24823897.0, + "reward": 0.8505859375, + "reward_std": 0.3384297490119934, + "rewards/accuracy_reward/mean": 0.353515625, + "rewards/accuracy_reward/std": 0.47852855920791626, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3732.0, + "completions/max_terminated_length": 3732.0, + "completions/mean_length": 640.3125, + "completions/mean_terminated_length": 640.3125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.12116770489800904, + "grad_norm": 1.7368619279090036, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 25239497.0, + "reward": 0.7890625, + "reward_std": 0.31373071670532227, + "rewards/accuracy_reward/mean": 0.29296875, + "rewards/accuracy_reward/std": 0.455569326877594, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3270.0, + "completions/max_terminated_length": 3270.0, + "completions/mean_length": 713.025390625, + "completions/mean_terminated_length": 713.025390625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.12312202271894467, + "grad_norm": 1.8280418486012862, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 25688182.0, + "reward": 0.865234375, + "reward_std": 0.3824591338634491, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.48250964283943176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3558.0, + "completions/max_terminated_length": 3558.0, + "completions/mean_length": 677.810546875, + "completions/mean_terminated_length": 679.136962890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.1250763405398803, + "grad_norm": 0.3259564251559098, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 26117061.0, + "reward": 0.9404296875, + "reward_std": 0.3696586489677429, + "rewards/accuracy_reward/mean": 0.443359375, + "rewards/accuracy_reward/std": 0.49726733565330505, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4297.0, + "completions/max_terminated_length": 4297.0, + "completions/mean_length": 648.72265625, + "completions/mean_terminated_length": 648.72265625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.12703065836081592, + "grad_norm": 0.33301180327596414, + "learning_rate": 1e-06, + "loss": 0.0319, + "num_tokens": 26505255.0, + "reward": 0.884765625, + "reward_std": 0.35973697900772095, + "rewards/accuracy_reward/mean": 0.388671875, + "rewards/accuracy_reward/std": 0.4879252314567566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4946.0, + "completions/max_terminated_length": 4946.0, + "completions/mean_length": 685.53515625, + "completions/mean_terminated_length": 685.53515625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.12898497618175156, + "grad_norm": 0.5560188336338364, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 26926297.0, + "reward": 0.923828125, + "reward_std": 0.41401469707489014, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4955156147480011, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1295.0, + "completions/max_terminated_length": 1295.0, + "completions/mean_length": 544.0078125, + "completions/mean_terminated_length": 544.0078125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.1309392940026872, + "grad_norm": 0.35907561018393463, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 27278781.0, + "reward": 1.060546875, + "reward_std": 0.3138608932495117, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49656352400779724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1897.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 615.921875, + "completions/mean_terminated_length": 615.921875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1328936118236228, + "grad_norm": 0.4176299596931847, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 27666773.0, + "reward": 1.0859375, + "reward_std": 0.396852970123291, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49304109811782837, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4510.0, + "completions/max_terminated_length": 4510.0, + "completions/mean_length": 706.3515625, + "completions/mean_terminated_length": 706.3515625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.13484792964455844, + "grad_norm": 0.37450053231165964, + "learning_rate": 1e-06, + "loss": 0.0322, + "num_tokens": 28095673.0, + "reward": 0.9833984375, + "reward_std": 0.3956748843193054, + "rewards/accuracy_reward/mean": 0.490234375, + "rewards/accuracy_reward/std": 0.5003935098648071, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4797.0, + "completions/max_terminated_length": 4797.0, + "completions/mean_length": 688.916015625, + "completions/mean_terminated_length": 688.916015625, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.13680224746549408, + "grad_norm": 0.3923203402640482, + "learning_rate": 1e-06, + "loss": 0.012, + "num_tokens": 28521662.0, + "reward": 0.9921875, + "reward_std": 0.37730276584625244, + "rewards/accuracy_reward/mean": 0.49609375, + "rewards/accuracy_reward/std": 0.5004737377166748, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4396.0, + "completions/max_terminated_length": 4396.0, + "completions/mean_length": 786.294921875, + "completions/mean_terminated_length": 786.294921875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.1387565652864297, + "grad_norm": 0.3069302931855473, + "learning_rate": 1e-06, + "loss": 0.0176, + "num_tokens": 28985189.0, + "reward": 0.9912109375, + "reward_std": 0.2845304012298584, + "rewards/accuracy_reward/mean": 0.49609375, + "rewards/accuracy_reward/std": 0.5004737377166748, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4738.0, + "completions/max_terminated_length": 4738.0, + "completions/mean_length": 794.650390625, + "completions/mean_terminated_length": 794.650390625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.14071088310736535, + "grad_norm": 0.2977844359235718, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 29450786.0, + "reward": 0.8681640625, + "reward_std": 0.35338905453681946, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4845963716506958, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4796.0, + "completions/max_terminated_length": 4796.0, + "completions/mean_length": 618.41796875, + "completions/mean_terminated_length": 618.41796875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.14266520092830096, + "grad_norm": 0.3997809481235959, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 29830024.0, + "reward": 1.095703125, + "reward_std": 0.3584830164909363, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4900552034378052, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4208.0, + "completions/max_terminated_length": 4208.0, + "completions/mean_length": 643.44921875, + "completions/mean_terminated_length": 643.44921875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.1446195187492366, + "grad_norm": 0.3572820278721409, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 30215022.0, + "reward": 1.0712890625, + "reward_std": 0.33012235164642334, + "rewards/accuracy_reward/mean": 0.57421875, + "rewards/accuracy_reward/std": 0.4949444830417633, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4450.0, + "completions/max_terminated_length": 4450.0, + "completions/mean_length": 741.4609375, + "completions/mean_terminated_length": 741.4609375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.14657383657017223, + "grad_norm": 0.2737951536443479, + "learning_rate": 1e-06, + "loss": 0.0318, + "num_tokens": 30659226.0, + "reward": 0.994140625, + "reward_std": 0.24682985246181488, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5004889965057373, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4175.0, + "completions/max_terminated_length": 4175.0, + "completions/mean_length": 844.029296875, + "completions/mean_terminated_length": 844.029296875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.14852815439110786, + "grad_norm": 0.3161065576024958, + "learning_rate": 1e-06, + "loss": 0.0248, + "num_tokens": 31155273.0, + "reward": 0.8037109375, + "reward_std": 0.291909784078598, + "rewards/accuracy_reward/mean": 0.30859375, + "rewards/accuracy_reward/std": 0.4623647928237915, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5364.0, + "completions/max_terminated_length": 5364.0, + "completions/mean_length": 838.318359375, + "completions/mean_terminated_length": 838.318359375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.15048247221204347, + "grad_norm": 0.34070879844879426, + "learning_rate": 1e-06, + "loss": 0.0297, + "num_tokens": 31646172.0, + "reward": 0.8525390625, + "reward_std": 0.2849617898464203, + "rewards/accuracy_reward/mean": 0.361328125, + "rewards/accuracy_reward/std": 0.48085519671440125, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.982421875, + "rewards/soft_format_reward/std": 0.13154059648513794, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4168.0, + "completions/max_terminated_length": 4168.0, + "completions/mean_length": 761.470703125, + "completions/mean_terminated_length": 761.470703125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.1524367900329791, + "grad_norm": 0.33483519695212355, + "learning_rate": 1e-06, + "loss": 0.0288, + "num_tokens": 32092477.0, + "reward": 0.890625, + "reward_std": 0.34418681263923645, + "rewards/accuracy_reward/mean": 0.392578125, + "rewards/accuracy_reward/std": 0.4888018071651459, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5032.0, + "completions/max_terminated_length": 5032.0, + "completions/mean_length": 887.9765625, + "completions/mean_terminated_length": 887.9765625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.15439110785391474, + "grad_norm": 0.3103945153500299, + "learning_rate": 1e-06, + "loss": 0.0277, + "num_tokens": 32614529.0, + "reward": 0.82421875, + "reward_std": 0.3129928708076477, + "rewards/accuracy_reward/mean": 0.333984375, + "rewards/accuracy_reward/std": 0.47209542989730835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98046875, + "rewards/soft_format_reward/std": 0.1385180652141571, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3990.0, + "completions/max_terminated_length": 3990.0, + "completions/mean_length": 836.138671875, + "completions/mean_terminated_length": 836.138671875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.15634542567485038, + "grad_norm": 0.36799917387633996, + "learning_rate": 1e-06, + "loss": 0.0323, + "num_tokens": 33105240.0, + "reward": 0.88671875, + "reward_std": 0.294155091047287, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48836761713027954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5339.0, + "completions/max_terminated_length": 5339.0, + "completions/mean_length": 832.505859375, + "completions/mean_terminated_length": 832.505859375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.158299743495786, + "grad_norm": 0.3433978792670212, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 33594443.0, + "reward": 0.8408203125, + "reward_std": 0.2898126542568207, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4754233956336975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3839.0, + "completions/max_terminated_length": 3839.0, + "completions/mean_length": 829.314453125, + "completions/mean_terminated_length": 829.314453125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.16025406131672162, + "grad_norm": 0.3927879946383985, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 34079324.0, + "reward": 0.90234375, + "reward_std": 0.3342772126197815, + "rewards/accuracy_reward/mean": 0.404296875, + "rewards/accuracy_reward/std": 0.4912354052066803, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4702.0, + "completions/max_terminated_length": 4702.0, + "completions/mean_length": 779.54296875, + "completions/mean_terminated_length": 779.54296875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.16220837913765726, + "grad_norm": 0.4188859762955116, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 34538050.0, + "reward": 0.9873046875, + "reward_std": 0.3072289824485779, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5004279017448425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4880.0, + "completions/max_terminated_length": 4880.0, + "completions/mean_length": 578.962890625, + "completions/mean_terminated_length": 578.962890625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.1641626969585929, + "grad_norm": 0.495413107807339, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 34896063.0, + "reward": 1.06640625, + "reward_std": 0.3018982410430908, + "rewards/accuracy_reward/mean": 0.568359375, + "rewards/accuracy_reward/std": 0.4957893490791321, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2220.0, + "completions/max_terminated_length": 2220.0, + "completions/mean_length": 598.8359375, + "completions/mean_terminated_length": 598.8359375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.16611701477952853, + "grad_norm": 1.4089485788883784, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 35272587.0, + "reward": 1.158203125, + "reward_std": 0.28979820013046265, + "rewards/accuracy_reward/mean": 0.658203125, + "rewards/accuracy_reward/std": 0.4747757613658905, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3947.0, + "completions/max_terminated_length": 3947.0, + "completions/mean_length": 647.265625, + "completions/mean_terminated_length": 647.265625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.16807133260046414, + "grad_norm": 0.5096085844315036, + "learning_rate": 1e-06, + "loss": 0.0141, + "num_tokens": 35665059.0, + "reward": 1.0849609375, + "reward_std": 0.33338090777397156, + "rewards/accuracy_reward/mean": 0.587890625, + "rewards/accuracy_reward/std": 0.49269601702690125, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4392.0, + "completions/max_terminated_length": 4392.0, + "completions/mean_length": 717.1484375, + "completions/mean_terminated_length": 717.1484375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.17002565042139978, + "grad_norm": 0.4458316711746706, + "learning_rate": 1e-06, + "loss": 0.0409, + "num_tokens": 36095615.0, + "reward": 1.1484375, + "reward_std": 0.2980383336544037, + "rewards/accuracy_reward/mean": 0.65234375, + "rewards/accuracy_reward/std": 0.47669193148612976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5725.0, + "completions/max_terminated_length": 5725.0, + "completions/mean_length": 657.28515625, + "completions/mean_terminated_length": 657.28515625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.1719799682423354, + "grad_norm": 0.406080441051074, + "learning_rate": 1e-06, + "loss": 0.0372, + "num_tokens": 36508305.0, + "reward": 1.0830078125, + "reward_std": 0.29227039217948914, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49304109811782837, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4748.0, + "completions/max_terminated_length": 4748.0, + "completions/mean_length": 631.98828125, + "completions/mean_terminated_length": 631.98828125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.17393428606327105, + "grad_norm": 0.55104982258479, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 36891451.0, + "reward": 1.142578125, + "reward_std": 0.333560585975647, + "rewards/accuracy_reward/mean": 0.64453125, + "rewards/accuracy_reward/std": 0.47912323474884033, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4624.0, + "completions/max_terminated_length": 4624.0, + "completions/mean_length": 659.78125, + "completions/mean_terminated_length": 659.78125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.17588860388420666, + "grad_norm": 0.4909120738291484, + "learning_rate": 1e-06, + "loss": 0.0325, + "num_tokens": 37290699.0, + "reward": 1.1044921875, + "reward_std": 0.35391879081726074, + "rewards/accuracy_reward/mean": 0.607421875, + "rewards/accuracy_reward/std": 0.4888018071651459, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2217.0, + "completions/max_terminated_length": 2217.0, + "completions/mean_length": 596.451171875, + "completions/mean_terminated_length": 596.451171875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.1778429217051423, + "grad_norm": 0.5663443236202059, + "learning_rate": 1e-06, + "loss": -0.0123, + "num_tokens": 37654642.0, + "reward": 1.15234375, + "reward_std": 0.26311245560646057, + "rewards/accuracy_reward/mean": 0.65234375, + "rewards/accuracy_reward/std": 0.47669193148612976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4362.0, + "completions/max_terminated_length": 4362.0, + "completions/mean_length": 576.95703125, + "completions/mean_terminated_length": 576.95703125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.17979723952607793, + "grad_norm": 0.6088814470925175, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 38002460.0, + "reward": 1.2451171875, + "reward_std": 0.31163787841796875, + "rewards/accuracy_reward/mean": 0.74609375, + "rewards/accuracy_reward/std": 0.43567025661468506, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4527.0, + "completions/max_terminated_length": 4527.0, + "completions/mean_length": 617.337890625, + "completions/mean_terminated_length": 617.337890625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.18175155734701356, + "grad_norm": 0.4769978996017094, + "learning_rate": 1e-06, + "loss": 0.0176, + "num_tokens": 38379609.0, + "reward": 1.1845703125, + "reward_std": 0.327314555644989, + "rewards/accuracy_reward/mean": 0.689453125, + "rewards/accuracy_reward/std": 0.46317005157470703, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4142.0, + "completions/max_terminated_length": 4142.0, + "completions/mean_length": 616.0390625, + "completions/mean_terminated_length": 616.0390625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.1837058751679492, + "grad_norm": 0.47579063226678087, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 38759901.0, + "reward": 1.1513671875, + "reward_std": 0.28064775466918945, + "rewards/accuracy_reward/mean": 0.65234375, + "rewards/accuracy_reward/std": 0.47669193148612976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4283.0, + "completions/max_terminated_length": 4283.0, + "completions/mean_length": 683.416015625, + "completions/mean_terminated_length": 683.416015625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.1856601929888848, + "grad_norm": 0.5759146821182537, + "learning_rate": 1e-06, + "loss": 0.0173, + "num_tokens": 39188946.0, + "reward": 1.0947265625, + "reward_std": 0.3013327717781067, + "rewards/accuracy_reward/mean": 0.59765625, + "rewards/accuracy_reward/std": 0.4908501207828522, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4291.0, + "completions/max_terminated_length": 4291.0, + "completions/mean_length": 706.779296875, + "completions/mean_terminated_length": 706.779296875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.18761451080982045, + "grad_norm": 0.48906489429112854, + "learning_rate": 1e-06, + "loss": 0.0373, + "num_tokens": 39613265.0, + "reward": 1.1201171875, + "reward_std": 0.3672279715538025, + "rewards/accuracy_reward/mean": 0.62109375, + "rewards/accuracy_reward/std": 0.4855891764163971, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3345.0, + "completions/max_terminated_length": 3345.0, + "completions/mean_length": 656.37890625, + "completions/mean_terminated_length": 656.37890625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.18956882863075608, + "grad_norm": 0.48300450988988175, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 40012147.0, + "reward": 1.048828125, + "reward_std": 0.2704191207885742, + "rewards/accuracy_reward/mean": 0.548828125, + "rewards/accuracy_reward/std": 0.498096764087677, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3655.0, + "completions/max_terminated_length": 3655.0, + "completions/mean_length": 579.62890625, + "completions/mean_terminated_length": 579.62890625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.19152314645169172, + "grad_norm": 0.5336429925214926, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 40385781.0, + "reward": 1.048828125, + "reward_std": 0.32563331723213196, + "rewards/accuracy_reward/mean": 0.552734375, + "rewards/accuracy_reward/std": 0.4976975917816162, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3604.0, + "completions/max_terminated_length": 3604.0, + "completions/mean_length": 578.455078125, + "completions/mean_terminated_length": 578.455078125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.19347746427262733, + "grad_norm": 0.5322411209459456, + "learning_rate": 1e-06, + "loss": 0.0505, + "num_tokens": 40740254.0, + "reward": 1.1796875, + "reward_std": 0.32885950803756714, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4670529365539551, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2622.0, + "completions/max_terminated_length": 2622.0, + "completions/mean_length": 601.45703125, + "completions/mean_terminated_length": 601.45703125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.19543178209356296, + "grad_norm": 0.4908983576351016, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 41106248.0, + "reward": 1.1796875, + "reward_std": 0.24825888872146606, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4670529365539551, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4306.0, + "completions/max_terminated_length": 4306.0, + "completions/mean_length": 739.62109375, + "completions/mean_terminated_length": 739.62109375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.1973860999144986, + "grad_norm": 0.45849934987254326, + "learning_rate": 1e-06, + "loss": 0.022, + "num_tokens": 41542966.0, + "reward": 1.1142578125, + "reward_std": 0.32380765676498413, + "rewards/accuracy_reward/mean": 0.6171875, + "rewards/accuracy_reward/std": 0.486548513174057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6708.0, + "completions/max_terminated_length": 6708.0, + "completions/mean_length": 657.826171875, + "completions/mean_terminated_length": 657.826171875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.19934041773543423, + "grad_norm": 0.5491490468834526, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 41962061.0, + "reward": 1.1103515625, + "reward_std": 0.3505961298942566, + "rewards/accuracy_reward/mean": 0.611328125, + "rewards/accuracy_reward/std": 0.4879252314567566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6782.0, + "completions/max_terminated_length": 6782.0, + "completions/mean_length": 660.41796875, + "completions/mean_terminated_length": 660.41796875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.20129473555636984, + "grad_norm": 0.38607391821906956, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 42363843.0, + "reward": 1.130859375, + "reward_std": 0.29166144132614136, + "rewards/accuracy_reward/mean": 0.630859375, + "rewards/accuracy_reward/std": 0.4830440282821655, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4334.0, + "completions/max_terminated_length": 4334.0, + "completions/mean_length": 720.955078125, + "completions/mean_terminated_length": 720.955078125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.20324905337730548, + "grad_norm": 0.46975758283776303, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 42806252.0, + "reward": 1.083984375, + "reward_std": 0.3343064785003662, + "rewards/accuracy_reward/mean": 0.58984375, + "rewards/accuracy_reward/std": 0.49234291911125183, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5322.0, + "completions/max_terminated_length": 5322.0, + "completions/mean_length": 737.857421875, + "completions/mean_terminated_length": 737.857421875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.2052033711982411, + "grad_norm": 0.5433537816873932, + "learning_rate": 1e-06, + "loss": 0.0552, + "num_tokens": 43258499.0, + "reward": 1.0537109375, + "reward_std": 0.34272387623786926, + "rewards/accuracy_reward/mean": 0.556640625, + "rewards/accuracy_reward/std": 0.49726733565330505, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6099.0, + "completions/max_terminated_length": 6099.0, + "completions/mean_length": 722.546875, + "completions/mean_terminated_length": 722.546875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.20715768901917675, + "grad_norm": 0.4948515829930095, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 43700203.0, + "reward": 1.109375, + "reward_std": 0.35553398728370667, + "rewards/accuracy_reward/mean": 0.611328125, + "rewards/accuracy_reward/std": 0.4879252314567566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5345.0, + "completions/max_terminated_length": 5345.0, + "completions/mean_length": 776.197265625, + "completions/mean_terminated_length": 776.197265625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.20911200684011239, + "grad_norm": 0.4017732993469144, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 44175296.0, + "reward": 0.9892578125, + "reward_std": 0.31362634897232056, + "rewards/accuracy_reward/mean": 0.494140625, + "rewards/accuracy_reward/std": 0.5004546642303467, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4531.0, + "completions/max_terminated_length": 4531.0, + "completions/mean_length": 718.513671875, + "completions/mean_terminated_length": 718.513671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.211066324661048, + "grad_norm": 0.445236558059886, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 44619463.0, + "reward": 1.1416015625, + "reward_std": 0.3152064085006714, + "rewards/accuracy_reward/mean": 0.642578125, + "rewards/accuracy_reward/std": 0.4797092080116272, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5380.0, + "completions/max_terminated_length": 5380.0, + "completions/mean_length": 737.1796875, + "completions/mean_terminated_length": 738.622314453125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.21302064248198363, + "grad_norm": 0.44425335111657727, + "learning_rate": 1e-06, + "loss": 0.0373, + "num_tokens": 45056179.0, + "reward": 1.1669921875, + "reward_std": 0.2877297103404999, + "rewards/accuracy_reward/mean": 0.677734375, + "rewards/accuracy_reward/std": 0.46780112385749817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.978515625, + "rewards/soft_format_reward/std": 0.14513419568538666, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3749.0, + "completions/max_terminated_length": 3749.0, + "completions/mean_length": 681.171875, + "completions/mean_terminated_length": 681.171875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.21497496030291927, + "grad_norm": 0.4678600140080484, + "learning_rate": 1e-06, + "loss": 0.0419, + "num_tokens": 45493995.0, + "reward": 1.04296875, + "reward_std": 0.3182796835899353, + "rewards/accuracy_reward/mean": 0.548828125, + "rewards/accuracy_reward/std": 0.498096764087677, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7866.0, + "completions/max_terminated_length": 7866.0, + "completions/mean_length": 694.953125, + "completions/mean_terminated_length": 694.953125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.2169292781238549, + "grad_norm": 0.44578045283772877, + "learning_rate": 1e-06, + "loss": 0.0522, + "num_tokens": 45920291.0, + "reward": 1.146484375, + "reward_std": 0.2866002917289734, + "rewards/accuracy_reward/mean": 0.654296875, + "rewards/accuracy_reward/std": 0.4760620892047882, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6940.0, + "completions/max_terminated_length": 6940.0, + "completions/mean_length": 773.4609375, + "completions/mean_terminated_length": 773.4609375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.2188835959447905, + "grad_norm": 0.45299601259932204, + "learning_rate": 1e-06, + "loss": 0.0506, + "num_tokens": 46393759.0, + "reward": 1.1416015625, + "reward_std": 0.3179640769958496, + "rewards/accuracy_reward/mean": 0.646484375, + "rewards/accuracy_reward/std": 0.47852855920791626, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4527.0, + "completions/max_terminated_length": 4527.0, + "completions/mean_length": 671.703125, + "completions/mean_terminated_length": 671.703125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.22083791376572615, + "grad_norm": 0.3490639277596978, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 46795895.0, + "reward": 1.1376953125, + "reward_std": 0.2308749556541443, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.48028653860092163, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4731.0, + "completions/max_terminated_length": 4731.0, + "completions/mean_length": 788.478515625, + "completions/mean_terminated_length": 788.478515625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.22279223158666178, + "grad_norm": 0.457654825691392, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 47272060.0, + "reward": 0.99609375, + "reward_std": 0.3291885256767273, + "rewards/accuracy_reward/mean": 0.50390625, + "rewards/accuracy_reward/std": 0.5004737377166748, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3701.0, + "completions/max_terminated_length": 3701.0, + "completions/mean_length": 750.9921875, + "completions/mean_terminated_length": 750.9921875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.22474654940759742, + "grad_norm": 0.35434488032198186, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 47719320.0, + "reward": 1.2119140625, + "reward_std": 0.2785128951072693, + "rewards/accuracy_reward/mean": 0.712890625, + "rewards/accuracy_reward/std": 0.45285552740097046, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3672.0, + "completions/max_terminated_length": 3672.0, + "completions/mean_length": 752.19921875, + "completions/mean_terminated_length": 752.19921875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.22670086722853305, + "grad_norm": 0.409524691728144, + "learning_rate": 1e-06, + "loss": 0.0224, + "num_tokens": 48164974.0, + "reward": 1.08203125, + "reward_std": 0.2868836522102356, + "rewards/accuracy_reward/mean": 0.583984375, + "rewards/accuracy_reward/std": 0.493378221988678, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4533.0, + "completions/max_terminated_length": 4533.0, + "completions/mean_length": 678.97265625, + "completions/mean_terminated_length": 680.3013916015625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.22865518504946866, + "grad_norm": 0.43588535858806265, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 48589536.0, + "reward": 1.150390625, + "reward_std": 0.27419424057006836, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4754233956336975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3938.0, + "completions/max_terminated_length": 3938.0, + "completions/mean_length": 661.53515625, + "completions/mean_terminated_length": 661.53515625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.2306095028704043, + "grad_norm": 0.3900342305505407, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 49003842.0, + "reward": 1.1318359375, + "reward_std": 0.24747905135154724, + "rewards/accuracy_reward/mean": 0.63671875, + "rewards/accuracy_reward/std": 0.4814152419567108, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4017.0, + "completions/max_terminated_length": 4017.0, + "completions/mean_length": 739.3828125, + "completions/mean_terminated_length": 740.8297119140625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.23256382069133993, + "grad_norm": 0.3413704993926388, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 49438534.0, + "reward": 1.224609375, + "reward_std": 0.27179744839668274, + "rewards/accuracy_reward/mean": 0.7265625, + "rewards/accuracy_reward/std": 0.4461594223976135, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3898.0, + "completions/max_terminated_length": 3898.0, + "completions/mean_length": 805.703125, + "completions/mean_terminated_length": 805.703125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.23451813851227557, + "grad_norm": 0.3317510522874492, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 49920862.0, + "reward": 0.9501953125, + "reward_std": 0.33403074741363525, + "rewards/accuracy_reward/mean": 0.451171875, + "rewards/accuracy_reward/std": 0.498096764087677, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6453.0, + "completions/max_terminated_length": 6453.0, + "completions/mean_length": 914.771484375, + "completions/mean_terminated_length": 914.771484375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.23647245633321118, + "grad_norm": 0.2854485976716137, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 50463145.0, + "reward": 0.7646484375, + "reward_std": 0.29717278480529785, + "rewards/accuracy_reward/mean": 0.26953125, + "rewards/accuracy_reward/std": 0.44415023922920227, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4101.0, + "completions/max_terminated_length": 4101.0, + "completions/mean_length": 971.4765625, + "completions/mean_terminated_length": 971.4765625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.23842677415414681, + "grad_norm": 0.2531475675187165, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 51032045.0, + "reward": 0.8505859375, + "reward_std": 0.31238460540771484, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4779251217842102, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4381.0, + "completions/max_terminated_length": 4381.0, + "completions/mean_length": 925.8984375, + "completions/mean_terminated_length": 925.8984375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.24038109197508245, + "grad_norm": 0.2994115993779575, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 51581577.0, + "reward": 0.8115234375, + "reward_std": 0.3205258250236511, + "rewards/accuracy_reward/mean": 0.314453125, + "rewards/accuracy_reward/std": 0.4647517800331116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3908.0, + "completions/max_terminated_length": 3908.0, + "completions/mean_length": 897.849609375, + "completions/mean_terminated_length": 897.849609375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.2423354097960181, + "grad_norm": 0.30541273874376906, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 52109228.0, + "reward": 0.923828125, + "reward_std": 0.3343881368637085, + "rewards/accuracy_reward/mean": 0.42578125, + "rewards/accuracy_reward/std": 0.4949444830417633, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4689.0, + "completions/max_terminated_length": 4689.0, + "completions/mean_length": 904.763671875, + "completions/mean_terminated_length": 904.763671875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.2442897276169537, + "grad_norm": 0.3585479438208608, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 52641587.0, + "reward": 0.8818359375, + "reward_std": 0.4076825976371765, + "rewards/accuracy_reward/mean": 0.38671875, + "rewards/accuracy_reward/std": 0.48747459053993225, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4396.0, + "completions/max_terminated_length": 4396.0, + "completions/mean_length": 899.87109375, + "completions/mean_terminated_length": 899.87109375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.24624404543788933, + "grad_norm": 0.29001670614637737, + "learning_rate": 1e-06, + "loss": 0.0387, + "num_tokens": 53164353.0, + "reward": 0.8564453125, + "reward_std": 0.37283623218536377, + "rewards/accuracy_reward/mean": 0.361328125, + "rewards/accuracy_reward/std": 0.48085519671440125, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4774.0, + "completions/max_terminated_length": 4774.0, + "completions/mean_length": 883.484375, + "completions/mean_terminated_length": 883.484375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.24819836325882497, + "grad_norm": 0.3242495382124342, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 53685625.0, + "reward": 0.798828125, + "reward_std": 0.3719630837440491, + "rewards/accuracy_reward/mean": 0.302734375, + "rewards/accuracy_reward/std": 0.45989060401916504, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3983.0, + "completions/max_terminated_length": 3983.0, + "completions/mean_length": 991.638671875, + "completions/mean_terminated_length": 991.638671875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.2501526810797606, + "grad_norm": 0.2877612798283809, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 54264496.0, + "reward": 0.8115234375, + "reward_std": 0.3776240944862366, + "rewards/accuracy_reward/mean": 0.314453125, + "rewards/accuracy_reward/std": 0.4647517800331116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3986.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 1007.052734375, + "completions/mean_terminated_length": 1007.052734375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.2521069989006962, + "grad_norm": 0.24006539805528612, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 54855483.0, + "reward": 0.82421875, + "reward_std": 0.3028924763202667, + "rewards/accuracy_reward/mean": 0.326171875, + "rewards/accuracy_reward/std": 0.4692695140838623, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4004.0, + "completions/max_terminated_length": 4004.0, + "completions/mean_length": 943.00390625, + "completions/mean_terminated_length": 943.00390625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.25406131672163185, + "grad_norm": 0.30221140687323506, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 55417197.0, + "reward": 0.9287109375, + "reward_std": 0.3796229362487793, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4955156147480011, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3776.0, + "completions/max_terminated_length": 3776.0, + "completions/mean_length": 1003.65625, + "completions/mean_terminated_length": 1003.65625, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.2560156345425675, + "grad_norm": 0.24079354258326716, + "learning_rate": 1e-06, + "loss": 0.0282, + "num_tokens": 56008861.0, + "reward": 0.9140625, + "reward_std": 0.35439926385879517, + "rewards/accuracy_reward/mean": 0.41796875, + "rewards/accuracy_reward/std": 0.4937073290348053, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4727.0, + "completions/max_terminated_length": 4727.0, + "completions/mean_length": 947.72265625, + "completions/mean_terminated_length": 947.72265625, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.2579699523635031, + "grad_norm": 0.2780611906450834, + "learning_rate": 1e-06, + "loss": 0.0332, + "num_tokens": 56566863.0, + "reward": 1.083984375, + "reward_std": 0.38542526960372925, + "rewards/accuracy_reward/mean": 0.587890625, + "rewards/accuracy_reward/std": 0.49269601702690125, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4811.0, + "completions/max_terminated_length": 4811.0, + "completions/mean_length": 947.9609375, + "completions/mean_terminated_length": 947.9609375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.25992427018443875, + "grad_norm": 0.2607724152051217, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 57127227.0, + "reward": 0.9267578125, + "reward_std": 0.3148196339607239, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4955156147480011, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 3937.0, + "completions/mean_length": 1053.130859375, + "completions/mean_terminated_length": 1053.130859375, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.2618785880053744, + "grad_norm": 0.2562936958346042, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 57744430.0, + "reward": 0.83203125, + "reward_std": 0.32906875014305115, + "rewards/accuracy_reward/mean": 0.333984375, + "rewards/accuracy_reward/std": 0.47209542989730835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3524.0, + "completions/max_terminated_length": 3524.0, + "completions/mean_length": 971.384765625, + "completions/mean_terminated_length": 971.384765625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.26383290582631, + "grad_norm": 0.27292794597962106, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 58331203.0, + "reward": 0.876953125, + "reward_std": 0.3299524188041687, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.486548513174057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4133.0, + "completions/max_terminated_length": 4133.0, + "completions/mean_length": 1041.1953125, + "completions/mean_terminated_length": 1041.1953125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.2657872236472456, + "grad_norm": 0.2718878450423647, + "learning_rate": 1e-06, + "loss": -0.009, + "num_tokens": 58934487.0, + "reward": 0.9130859375, + "reward_std": 0.40916621685028076, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49304109811782837, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2653.0, + "completions/max_terminated_length": 2653.0, + "completions/mean_length": 890.697265625, + "completions/mean_terminated_length": 890.697265625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.26774154146818124, + "grad_norm": 0.3034988620262506, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 59456284.0, + "reward": 0.82421875, + "reward_std": 0.32018736004829407, + "rewards/accuracy_reward/mean": 0.32421875, + "rewards/accuracy_reward/std": 0.4685399830341339, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4584.0, + "completions/max_terminated_length": 4584.0, + "completions/mean_length": 975.54296875, + "completions/mean_terminated_length": 975.54296875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.2696958592891169, + "grad_norm": 0.2973070974296242, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 60037522.0, + "reward": 0.8720703125, + "reward_std": 0.326847106218338, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4845963716506958, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3978.0, + "completions/max_terminated_length": 3978.0, + "completions/mean_length": 988.076171875, + "completions/mean_terminated_length": 988.076171875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.2716501771100525, + "grad_norm": 0.2753457287958741, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 60616809.0, + "reward": 0.9404296875, + "reward_std": 0.37956422567367554, + "rewards/accuracy_reward/mean": 0.443359375, + "rewards/accuracy_reward/std": 0.49726733565330505, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4181.0, + "completions/max_terminated_length": 4181.0, + "completions/mean_length": 1007.048828125, + "completions/mean_terminated_length": 1007.048828125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.27360449493098815, + "grad_norm": 0.28360950511157235, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 61207730.0, + "reward": 0.830078125, + "reward_std": 0.3750539720058441, + "rewards/accuracy_reward/mean": 0.33203125, + "rewards/accuracy_reward/std": 0.47140273451805115, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6820.0, + "completions/max_terminated_length": 6820.0, + "completions/mean_length": 968.765625, + "completions/mean_terminated_length": 968.765625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "epoch": 0.2755588127519238, + "grad_norm": 0.2762016288870139, + "learning_rate": 1e-06, + "loss": 0.0165, + "num_tokens": 61781562.0, + "reward": 0.8828125, + "reward_std": 0.32183653116226196, + "rewards/accuracy_reward/mean": 0.384765625, + "rewards/accuracy_reward/std": 0.4870156943798065, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4746.0, + "completions/max_terminated_length": 4746.0, + "completions/mean_length": 1011.654296875, + "completions/mean_terminated_length": 1011.654296875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.2775131305728594, + "grad_norm": 0.2821803519821113, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 62368921.0, + "reward": 0.912109375, + "reward_std": 0.3377918303012848, + "rewards/accuracy_reward/mean": 0.41796875, + "rewards/accuracy_reward/std": 0.4937073290348053, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4432.0, + "completions/max_terminated_length": 4432.0, + "completions/mean_length": 989.38671875, + "completions/mean_terminated_length": 989.38671875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.27946744839379506, + "grad_norm": 0.29075649075283083, + "learning_rate": 1e-06, + "loss": 0.0248, + "num_tokens": 62947919.0, + "reward": 0.8486328125, + "reward_std": 0.3569599986076355, + "rewards/accuracy_reward/mean": 0.353515625, + "rewards/accuracy_reward/std": 0.47852855920791626, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3366.0, + "completions/max_terminated_length": 3366.0, + "completions/mean_length": 990.04296875, + "completions/mean_terminated_length": 993.925537109375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.2814217662147307, + "grad_norm": 0.26914130080446463, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 63527893.0, + "reward": 0.974609375, + "reward_std": 0.2950121760368347, + "rewards/accuracy_reward/mean": 0.478515625, + "rewards/accuracy_reward/std": 0.5000267624855042, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3942.0, + "completions/max_terminated_length": 3942.0, + "completions/mean_length": 813.697265625, + "completions/mean_terminated_length": 813.697265625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.2833760840356663, + "grad_norm": 0.3348076969561777, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 64009130.0, + "reward": 1.095703125, + "reward_std": 0.28549301624298096, + "rewards/accuracy_reward/mean": 0.59765625, + "rewards/accuracy_reward/std": 0.4908501207828522, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5175.0, + "completions/max_terminated_length": 5175.0, + "completions/mean_length": 774.796875, + "completions/mean_terminated_length": 774.796875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.2853304018566019, + "grad_norm": 0.3216730964976698, + "learning_rate": 1e-06, + "loss": 0.023, + "num_tokens": 64468034.0, + "reward": 1.140625, + "reward_std": 0.2770097255706787, + "rewards/accuracy_reward/mean": 0.642578125, + "rewards/accuracy_reward/std": 0.4797092080116272, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4444.0, + "completions/max_terminated_length": 4444.0, + "completions/mean_length": 810.03125, + "completions/mean_terminated_length": 810.03125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.28728471967753755, + "grad_norm": 0.35746738510442333, + "learning_rate": 1e-06, + "loss": 0.0237, + "num_tokens": 64945602.0, + "reward": 1.0146484375, + "reward_std": 0.34599173069000244, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5002445578575134, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4119.0, + "completions/max_terminated_length": 4119.0, + "completions/mean_length": 812.662109375, + "completions/mean_terminated_length": 812.662109375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.2892390374984732, + "grad_norm": 0.3021469148815593, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 65421173.0, + "reward": 1.0732421875, + "reward_std": 0.2631635069847107, + "rewards/accuracy_reward/mean": 0.576171875, + "rewards/accuracy_reward/std": 0.4946470856666565, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4707.0, + "completions/max_terminated_length": 4707.0, + "completions/mean_length": 785.263671875, + "completions/mean_terminated_length": 785.263671875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.2911933553194088, + "grad_norm": 0.35030451020352243, + "learning_rate": 1e-06, + "loss": 0.0234, + "num_tokens": 65890076.0, + "reward": 0.9794921875, + "reward_std": 0.2742602825164795, + "rewards/accuracy_reward/mean": 0.48046875, + "rewards/accuracy_reward/std": 0.5001069903373718, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5320.0, + "completions/max_terminated_length": 5320.0, + "completions/mean_length": 741.138671875, + "completions/mean_terminated_length": 741.138671875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.29314767314034446, + "grad_norm": 0.38395492198270637, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 66330931.0, + "reward": 1.068359375, + "reward_std": 0.35822051763534546, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4955156147480011, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3975.0, + "completions/max_terminated_length": 3975.0, + "completions/mean_length": 977.92578125, + "completions/mean_terminated_length": 977.92578125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.2951019909612801, + "grad_norm": 0.3218756943899402, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 66900301.0, + "reward": 1.0517578125, + "reward_std": 0.29146334528923035, + "rewards/accuracy_reward/mean": 0.55859375, + "rewards/accuracy_reward/std": 0.4970405399799347, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2920.0, + "completions/max_terminated_length": 2920.0, + "completions/mean_length": 771.552734375, + "completions/mean_terminated_length": 771.552734375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.29705630878221573, + "grad_norm": 0.37871903406010093, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 67356824.0, + "reward": 1.0908203125, + "reward_std": 0.2862834334373474, + "rewards/accuracy_reward/mean": 0.591796875, + "rewards/accuracy_reward/std": 0.49198177456855774, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4896.0, + "completions/max_terminated_length": 4896.0, + "completions/mean_length": 787.67578125, + "completions/mean_terminated_length": 787.67578125, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.29901062660315136, + "grad_norm": 0.3562407863492993, + "learning_rate": 1e-06, + "loss": 0.0308, + "num_tokens": 67822354.0, + "reward": 1.08984375, + "reward_std": 0.276405394077301, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49161264300346375, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3825.0, + "completions/max_terminated_length": 3825.0, + "completions/mean_length": 789.732421875, + "completions/mean_terminated_length": 789.732421875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.30096494442408694, + "grad_norm": 0.36332581753663423, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 68288265.0, + "reward": 1.0673828125, + "reward_std": 0.2996455430984497, + "rewards/accuracy_reward/mean": 0.568359375, + "rewards/accuracy_reward/std": 0.4957893490791321, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4548.0, + "completions/max_terminated_length": 4548.0, + "completions/mean_length": 887.986328125, + "completions/mean_terminated_length": 887.986328125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.3029192622450226, + "grad_norm": 0.4331837182377617, + "learning_rate": 1e-06, + "loss": 0.02, + "num_tokens": 68812402.0, + "reward": 1.0361328125, + "reward_std": 0.3580864369869232, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.4989593029022217, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3808.0, + "completions/max_terminated_length": 3808.0, + "completions/mean_length": 847.5, + "completions/mean_terminated_length": 847.5, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.3048735800659582, + "grad_norm": 0.38826242845126885, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 69322162.0, + "reward": 1.0390625, + "reward_std": 0.3159153163433075, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.4989593029022217, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4442.0, + "completions/max_terminated_length": 4442.0, + "completions/mean_length": 881.166015625, + "completions/mean_terminated_length": 881.166015625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.30682789788689385, + "grad_norm": 0.39315441723304445, + "learning_rate": 1e-06, + "loss": 0.028, + "num_tokens": 69840359.0, + "reward": 1.0869140625, + "reward_std": 0.33045780658721924, + "rewards/accuracy_reward/mean": 0.587890625, + "rewards/accuracy_reward/std": 0.49269601702690125, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4964.0, + "completions/max_terminated_length": 4964.0, + "completions/mean_length": 907.76953125, + "completions/mean_terminated_length": 907.76953125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.3087822157078295, + "grad_norm": 0.41776743845591824, + "learning_rate": 1e-06, + "loss": 0.0326, + "num_tokens": 70372513.0, + "reward": 1.1474609375, + "reward_std": 0.3249959647655487, + "rewards/accuracy_reward/mean": 0.65234375, + "rewards/accuracy_reward/std": 0.47669193148612976, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6065.0, + "completions/max_terminated_length": 6065.0, + "completions/mean_length": 987.37109375, + "completions/mean_terminated_length": 987.37109375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.3107365335287651, + "grad_norm": 0.3989205336784414, + "learning_rate": 1e-06, + "loss": 0.0511, + "num_tokens": 70939455.0, + "reward": 1.0361328125, + "reward_std": 0.387142151594162, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.4989593029022217, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4243.0, + "completions/max_terminated_length": 4243.0, + "completions/mean_length": 924.203125, + "completions/mean_terminated_length": 924.203125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.31269085134970076, + "grad_norm": 0.350184061608897, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 71479127.0, + "reward": 1.0078125, + "reward_std": 0.3427523076534271, + "rewards/accuracy_reward/mean": 0.509765625, + "rewards/accuracy_reward/std": 0.5003935098648071, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5306.0, + "completions/max_terminated_length": 5306.0, + "completions/mean_length": 902.58984375, + "completions/mean_terminated_length": 902.58984375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.3146451691706364, + "grad_norm": 0.34211107026389803, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 72007413.0, + "reward": 1.064453125, + "reward_std": 0.31646695733070374, + "rewards/accuracy_reward/mean": 0.56640625, + "rewards/accuracy_reward/std": 0.4960552453994751, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4431.0, + "completions/max_terminated_length": 4431.0, + "completions/mean_length": 981.595703125, + "completions/mean_terminated_length": 981.595703125, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.316599486991572, + "grad_norm": 0.3493883702957425, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 72572438.0, + "reward": 0.9541015625, + "reward_std": 0.3735593259334564, + "rewards/accuracy_reward/mean": 0.45703125, + "rewards/accuracy_reward/std": 0.49863746762275696, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3953.0, + "completions/max_terminated_length": 3953.0, + "completions/mean_length": 991.599609375, + "completions/mean_terminated_length": 991.599609375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.3185538048125076, + "grad_norm": 0.3337376992933025, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 73147481.0, + "reward": 0.8896484375, + "reward_std": 0.37190183997154236, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48836761713027954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2453.0, + "completions/max_terminated_length": 2453.0, + "completions/mean_length": 1021.818359375, + "completions/mean_terminated_length": 1021.818359375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.32050812263344325, + "grad_norm": 0.3194890237385626, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 73732828.0, + "reward": 1.080078125, + "reward_std": 0.37582653760910034, + "rewards/accuracy_reward/mean": 0.580078125, + "rewards/accuracy_reward/std": 0.4940285086631775, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3849.0, + "completions/max_terminated_length": 3849.0, + "completions/mean_length": 917.935546875, + "completions/mean_terminated_length": 917.935546875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.3224624404543789, + "grad_norm": 0.3554221819324511, + "learning_rate": 1e-06, + "loss": 0.0254, + "num_tokens": 74264299.0, + "reward": 1.1015625, + "reward_std": 0.3214406371116638, + "rewards/accuracy_reward/mean": 0.603515625, + "rewards/accuracy_reward/std": 0.4896455705165863, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3324.0, + "completions/max_terminated_length": 3324.0, + "completions/mean_length": 928.6796875, + "completions/mean_terminated_length": 930.4970703125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.3244167582753145, + "grad_norm": 0.34006857647510036, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 74801655.0, + "reward": 0.8857421875, + "reward_std": 0.3358905613422394, + "rewards/accuracy_reward/mean": 0.38671875, + "rewards/accuracy_reward/std": 0.48747459053993225, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2693.0, + "completions/max_terminated_length": 2693.0, + "completions/mean_length": 1069.9375, + "completions/mean_terminated_length": 1069.9375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.32637107609625016, + "grad_norm": 0.31628840763003907, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 75415991.0, + "reward": 1.0546875, + "reward_std": 0.3212948143482208, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.49748632311820984, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3562.0, + "completions/max_terminated_length": 3562.0, + "completions/mean_length": 1089.83984375, + "completions/mean_terminated_length": 1089.83984375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.3283253939171858, + "grad_norm": 0.32281369169950347, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 76046981.0, + "reward": 1.1640625, + "reward_std": 0.35284948348999023, + "rewards/accuracy_reward/mean": 0.666015625, + "rewards/accuracy_reward/std": 0.47209542989730835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2409.0, + "completions/max_terminated_length": 2409.0, + "completions/mean_length": 952.912109375, + "completions/mean_terminated_length": 952.912109375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.33027971173812143, + "grad_norm": 0.31449388152957247, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 76597384.0, + "reward": 1.06640625, + "reward_std": 0.2608098089694977, + "rewards/accuracy_reward/mean": 0.56640625, + "rewards/accuracy_reward/std": 0.4960552453994751, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2247.0, + "completions/max_terminated_length": 2247.0, + "completions/mean_length": 935.126953125, + "completions/mean_terminated_length": 935.126953125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.33223402955905706, + "grad_norm": 0.3848412543372846, + "learning_rate": 1e-06, + "loss": 0.0152, + "num_tokens": 77140633.0, + "reward": 1.09765625, + "reward_std": 0.39436060190200806, + "rewards/accuracy_reward/mean": 0.59765625, + "rewards/accuracy_reward/std": 0.4908501207828522, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5203.0, + "completions/max_terminated_length": 5203.0, + "completions/mean_length": 1010.025390625, + "completions/mean_terminated_length": 1010.025390625, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.33418834737999265, + "grad_norm": 0.29997274963064524, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 77723094.0, + "reward": 1.044921875, + "reward_std": 0.3298400640487671, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4982847273349762, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2526.0, + "completions/max_terminated_length": 2526.0, + "completions/mean_length": 959.91015625, + "completions/mean_terminated_length": 959.91015625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.3361426652009283, + "grad_norm": 0.3149832154627408, + "learning_rate": 1e-06, + "loss": 0.0251, + "num_tokens": 78282856.0, + "reward": 0.97265625, + "reward_std": 0.3255975842475891, + "rewards/accuracy_reward/mean": 0.47265625, + "rewards/accuracy_reward/std": 0.49974003434181213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4560.0, + "completions/max_terminated_length": 4560.0, + "completions/mean_length": 930.896484375, + "completions/mean_terminated_length": 930.896484375, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.3380969830218639, + "grad_norm": 0.3802309589541985, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 78821907.0, + "reward": 0.9912109375, + "reward_std": 0.3749457895755768, + "rewards/accuracy_reward/mean": 0.494140625, + "rewards/accuracy_reward/std": 0.5004546642303467, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4765.0, + "completions/max_terminated_length": 4765.0, + "completions/mean_length": 914.41015625, + "completions/mean_terminated_length": 914.41015625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.34005130084279955, + "grad_norm": 0.2991567907546148, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 79351893.0, + "reward": 0.9951171875, + "reward_std": 0.24881646037101746, + "rewards/accuracy_reward/mean": 0.498046875, + "rewards/accuracy_reward/std": 0.5004851818084717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3300.0, + "completions/max_terminated_length": 3300.0, + "completions/mean_length": 902.923828125, + "completions/mean_terminated_length": 902.923828125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.3420056186637352, + "grad_norm": 0.3309114159349809, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 79878718.0, + "reward": 1.0146484375, + "reward_std": 0.31361550092697144, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5002445578575134, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2252.0, + "completions/max_terminated_length": 2252.0, + "completions/mean_length": 784.994140625, + "completions/mean_terminated_length": 784.994140625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.3439599364846708, + "grad_norm": 0.3608391521176332, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 80338843.0, + "reward": 1.18359375, + "reward_std": 0.3492187261581421, + "rewards/accuracy_reward/mean": 0.68359375, + "rewards/accuracy_reward/std": 0.46552830934524536, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4562.0, + "completions/max_terminated_length": 4562.0, + "completions/mean_length": 963.783203125, + "completions/mean_terminated_length": 963.783203125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.34591425430560646, + "grad_norm": 0.25457494321220553, + "learning_rate": 1e-06, + "loss": 0.0427, + "num_tokens": 80897868.0, + "reward": 1.07421875, + "reward_std": 0.29132741689682007, + "rewards/accuracy_reward/mean": 0.580078125, + "rewards/accuracy_reward/std": 0.4940285086631775, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2685.0, + "completions/max_terminated_length": 2685.0, + "completions/mean_length": 888.890625, + "completions/mean_terminated_length": 888.890625, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.3478685721265421, + "grad_norm": 0.32522984568829905, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 81414484.0, + "reward": 1.1171875, + "reward_std": 0.30815669894218445, + "rewards/accuracy_reward/mean": 0.6171875, + "rewards/accuracy_reward/std": 0.486548513174057, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3985.0, + "completions/max_terminated_length": 3985.0, + "completions/mean_length": 929.34375, + "completions/mean_terminated_length": 929.34375, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.34982288994747773, + "grad_norm": 0.31398798639485276, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 81951748.0, + "reward": 0.9501953125, + "reward_std": 0.33174121379852295, + "rewards/accuracy_reward/mean": 0.451171875, + "rewards/accuracy_reward/std": 0.498096764087677, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4900.0, + "completions/max_terminated_length": 4900.0, + "completions/mean_length": 1017.001953125, + "completions/mean_terminated_length": 1017.001953125, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.3517772077684133, + "grad_norm": 0.2713176474276239, + "learning_rate": 1e-06, + "loss": 0.0252, + "num_tokens": 82534437.0, + "reward": 1.0361328125, + "reward_std": 0.3269115686416626, + "rewards/accuracy_reward/mean": 0.541015625, + "rewards/accuracy_reward/std": 0.49880221486091614, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4734.0, + "completions/max_terminated_length": 4734.0, + "completions/mean_length": 928.146484375, + "completions/mean_terminated_length": 928.146484375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.35373152558934895, + "grad_norm": 0.3298102228576332, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 83073984.0, + "reward": 1.0078125, + "reward_std": 0.36109158396720886, + "rewards/accuracy_reward/mean": 0.509765625, + "rewards/accuracy_reward/std": 0.5003935098648071, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4745.0, + "completions/max_terminated_length": 4745.0, + "completions/mean_length": 916.298828125, + "completions/mean_terminated_length": 916.298828125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.3556858434102846, + "grad_norm": 0.26524178591301956, + "learning_rate": 1e-06, + "loss": 0.0298, + "num_tokens": 83614985.0, + "reward": 1.048828125, + "reward_std": 0.24359315633773804, + "rewards/accuracy_reward/mean": 0.556640625, + "rewards/accuracy_reward/std": 0.49726733565330505, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4640.0, + "completions/max_terminated_length": 4640.0, + "completions/mean_length": 847.353515625, + "completions/mean_terminated_length": 847.353515625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.3576401612312202, + "grad_norm": 0.352275785990723, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 84113966.0, + "reward": 1.001953125, + "reward_std": 0.34775465726852417, + "rewards/accuracy_reward/mean": 0.50390625, + "rewards/accuracy_reward/std": 0.5004737377166748, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4716.0, + "completions/max_terminated_length": 4716.0, + "completions/mean_length": 747.056640625, + "completions/mean_terminated_length": 747.056640625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.35959447905215586, + "grad_norm": 0.3315612986744808, + "learning_rate": 1e-06, + "loss": 0.0302, + "num_tokens": 84557979.0, + "reward": 1.2568359375, + "reward_std": 0.2530122399330139, + "rewards/accuracy_reward/mean": 0.76171875, + "rewards/accuracy_reward/std": 0.42644867300987244, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3683.0, + "completions/max_terminated_length": 3683.0, + "completions/mean_length": 647.302734375, + "completions/mean_terminated_length": 648.5694580078125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3615487968730915, + "grad_norm": 0.32470004440811306, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 84951350.0, + "reward": 1.140625, + "reward_std": 0.27286389470100403, + "rewards/accuracy_reward/mean": 0.642578125, + "rewards/accuracy_reward/std": 0.4797092080116272, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3089.0, + "completions/max_terminated_length": 3089.0, + "completions/mean_length": 841.55859375, + "completions/mean_terminated_length": 841.55859375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.36350311469402713, + "grad_norm": 0.30662615212256744, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 85444148.0, + "reward": 1.208984375, + "reward_std": 0.33242520689964294, + "rewards/accuracy_reward/mean": 0.7109375, + "rewards/accuracy_reward/std": 0.45377036929130554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4490.0, + "completions/max_terminated_length": 4490.0, + "completions/mean_length": 867.587890625, + "completions/mean_terminated_length": 867.587890625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.36545743251496277, + "grad_norm": 0.3423355132384457, + "learning_rate": 1e-06, + "loss": 0.0484, + "num_tokens": 85951409.0, + "reward": 1.08203125, + "reward_std": 0.3645339906215668, + "rewards/accuracy_reward/mean": 0.583984375, + "rewards/accuracy_reward/std": 0.493378221988678, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3864.0, + "completions/max_terminated_length": 3864.0, + "completions/mean_length": 881.30078125, + "completions/mean_terminated_length": 881.30078125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3674117503358984, + "grad_norm": 0.2775479054416856, + "learning_rate": 1e-06, + "loss": 0.0365, + "num_tokens": 86468123.0, + "reward": 1.1591796875, + "reward_std": 0.252810001373291, + "rewards/accuracy_reward/mean": 0.662109375, + "rewards/accuracy_reward/std": 0.4734536409378052, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4680.0, + "completions/max_terminated_length": 4680.0, + "completions/mean_length": 798.083984375, + "completions/mean_terminated_length": 798.083984375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.369366068156834, + "grad_norm": 0.2885160242803324, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 86937302.0, + "reward": 1.1767578125, + "reward_std": 0.2531892657279968, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4670529365539551, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3806.0, + "completions/max_terminated_length": 3806.0, + "completions/mean_length": 828.353515625, + "completions/mean_terminated_length": 828.353515625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.3713203859777696, + "grad_norm": 0.28598293252101664, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 87423179.0, + "reward": 1.1630859375, + "reward_std": 0.2855790853500366, + "rewards/accuracy_reward/mean": 0.6640625, + "rewards/accuracy_reward/std": 0.4727790653705597, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3304.0, + "completions/max_terminated_length": 3304.0, + "completions/mean_length": 702.0390625, + "completions/mean_terminated_length": 702.0390625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.37327470379870525, + "grad_norm": 0.3022123048024596, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 87846399.0, + "reward": 1.208984375, + "reward_std": 0.23385296761989594, + "rewards/accuracy_reward/mean": 0.708984375, + "rewards/accuracy_reward/std": 0.45467492938041687, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4684.0, + "completions/max_terminated_length": 4684.0, + "completions/mean_length": 851.568359375, + "completions/mean_terminated_length": 851.568359375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.3752290216196409, + "grad_norm": 0.3004071685869428, + "learning_rate": 1e-06, + "loss": 0.0465, + "num_tokens": 88345570.0, + "reward": 1.12109375, + "reward_std": 0.3215859532356262, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4845963716506958, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4376.0, + "completions/max_terminated_length": 4376.0, + "completions/mean_length": 926.634765625, + "completions/mean_terminated_length": 926.634765625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.3771833394405765, + "grad_norm": 0.2879747142580191, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 88884039.0, + "reward": 0.9228515625, + "reward_std": 0.33400365710258484, + "rewards/accuracy_reward/mean": 0.427734375, + "rewards/accuracy_reward/std": 0.4952339828014374, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2528.0, + "completions/max_terminated_length": 2528.0, + "completions/mean_length": 1082.259765625, + "completions/mean_terminated_length": 1082.259765625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.37913765726151216, + "grad_norm": 0.22557582208547772, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 89505260.0, + "reward": 0.7177734375, + "reward_std": 0.24846723675727844, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41380295157432556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4278.0, + "completions/max_terminated_length": 4278.0, + "completions/mean_length": 1143.162109375, + "completions/mean_terminated_length": 1143.162109375, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.3810919750824478, + "grad_norm": 0.25137817466474477, + "learning_rate": 1e-06, + "loss": 0.0374, + "num_tokens": 90157935.0, + "reward": 0.783203125, + "reward_std": 0.3198698163032532, + "rewards/accuracy_reward/mean": 0.29296875, + "rewards/accuracy_reward/std": 0.455569326877594, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98046875, + "rewards/soft_format_reward/std": 0.1385180652141571, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3970.0, + "completions/max_terminated_length": 3970.0, + "completions/mean_length": 1246.98046875, + "completions/mean_terminated_length": 1246.98046875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.38304629290338343, + "grad_norm": 0.21536781139781788, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 90855781.0, + "reward": 0.826171875, + "reward_std": 0.29626399278640747, + "rewards/accuracy_reward/mean": 0.330078125, + "rewards/accuracy_reward/std": 0.47070086002349854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5409.0, + "completions/max_terminated_length": 5409.0, + "completions/mean_length": 1176.931640625, + "completions/mean_terminated_length": 1176.931640625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.38500061072431907, + "grad_norm": 0.2199653759582803, + "learning_rate": 1e-06, + "loss": 0.061, + "num_tokens": 91530034.0, + "reward": 0.7197265625, + "reward_std": 0.23972320556640625, + "rewards/accuracy_reward/mean": 0.23046875, + "rewards/accuracy_reward/std": 0.42154473066329956, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.978515625, + "rewards/soft_format_reward/std": 0.14513419568538666, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4664.0, + "completions/max_terminated_length": 4664.0, + "completions/mean_length": 1075.326171875, + "completions/mean_terminated_length": 1075.326171875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.38695492854525465, + "grad_norm": 0.2565343477123443, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 92143001.0, + "reward": 0.755859375, + "reward_std": 0.22612373530864716, + "rewards/accuracy_reward/mean": 0.259765625, + "rewards/accuracy_reward/std": 0.4389347732067108, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4826.0, + "completions/max_terminated_length": 4826.0, + "completions/mean_length": 1205.125, + "completions/mean_terminated_length": 1205.125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.3889092463661903, + "grad_norm": 0.22063416660776963, + "learning_rate": 1e-06, + "loss": 0.0341, + "num_tokens": 92826505.0, + "reward": 0.7744140625, + "reward_std": 0.2775408625602722, + "rewards/accuracy_reward/mean": 0.283203125, + "rewards/accuracy_reward/std": 0.4509948492050171, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.982421875, + "rewards/soft_format_reward/std": 0.13154059648513794, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3058.0, + "completions/max_terminated_length": 3058.0, + "completions/mean_length": 1091.607421875, + "completions/mean_terminated_length": 1091.607421875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.3908635641871259, + "grad_norm": 0.23702804785120077, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 93447392.0, + "reward": 0.7548828125, + "reward_std": 0.26539403200149536, + "rewards/accuracy_reward/mean": 0.255859375, + "rewards/accuracy_reward/std": 0.43676990270614624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4498.0, + "completions/max_terminated_length": 4498.0, + "completions/mean_length": 1175.162109375, + "completions/mean_terminated_length": 1175.162109375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.39281788200806156, + "grad_norm": 0.20705181516443527, + "learning_rate": 1e-06, + "loss": 0.0285, + "num_tokens": 94116099.0, + "reward": 0.7587890625, + "reward_std": 0.2381991147994995, + "rewards/accuracy_reward/mean": 0.26171875, + "rewards/accuracy_reward/std": 0.44000017642974854, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4417.0, + "completions/max_terminated_length": 4417.0, + "completions/mean_length": 1145.099609375, + "completions/mean_terminated_length": 1145.099609375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.3947721998289972, + "grad_norm": 0.2099595228228698, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 94766966.0, + "reward": 0.81640625, + "reward_std": 0.26490187644958496, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4670529365539551, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3856.0, + "completions/max_terminated_length": 3856.0, + "completions/mean_length": 1144.3359375, + "completions/mean_terminated_length": 1144.3359375, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "epoch": 0.39672651764993283, + "grad_norm": 0.21470038825714835, + "learning_rate": 1e-06, + "loss": 0.02, + "num_tokens": 95421058.0, + "reward": 0.767578125, + "reward_std": 0.26772674918174744, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.4461594223976135, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3890.0, + "completions/max_terminated_length": 3890.0, + "completions/mean_length": 1349.583984375, + "completions/mean_terminated_length": 1352.22509765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.39868083547086847, + "grad_norm": 0.19076525638590527, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 96180013.0, + "reward": 0.7177734375, + "reward_std": 0.27950817346572876, + "rewards/accuracy_reward/mean": 0.224609375, + "rewards/accuracy_reward/std": 0.41773295402526855, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5936.0, + "completions/max_terminated_length": 5936.0, + "completions/mean_length": 1273.45703125, + "completions/mean_terminated_length": 1273.45703125, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.4006351532918041, + "grad_norm": 0.21652044241184726, + "learning_rate": 1e-06, + "loss": 0.0345, + "num_tokens": 96892951.0, + "reward": 0.7353515625, + "reward_std": 0.2698529362678528, + "rewards/accuracy_reward/mean": 0.244140625, + "rewards/accuracy_reward/std": 0.42999663949012756, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.982421875, + "rewards/soft_format_reward/std": 0.13154059648513794, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5038.0, + "completions/max_terminated_length": 5038.0, + "completions/mean_length": 1137.302734375, + "completions/mean_terminated_length": 1137.302734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.4025894711127397, + "grad_norm": 0.22644024391524878, + "learning_rate": 1e-06, + "loss": 0.0432, + "num_tokens": 97544546.0, + "reward": 0.8134765625, + "reward_std": 0.2465001791715622, + "rewards/accuracy_reward/mean": 0.318359375, + "rewards/accuracy_reward/std": 0.46629536151885986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5038.0, + "completions/max_terminated_length": 5038.0, + "completions/mean_length": 1275.13671875, + "completions/mean_terminated_length": 1275.13671875, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.4045437889336753, + "grad_norm": 0.233198079125074, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 98275944.0, + "reward": 0.7626953125, + "reward_std": 0.2693719267845154, + "rewards/accuracy_reward/mean": 0.26953125, + "rewards/accuracy_reward/std": 0.44415023922920227, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4584.0, + "completions/max_terminated_length": 4584.0, + "completions/mean_length": 1327.673828125, + "completions/mean_terminated_length": 1327.673828125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.40649810675461095, + "grad_norm": 0.20798303455877504, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 99025441.0, + "reward": 0.763671875, + "reward_std": 0.29067397117614746, + "rewards/accuracy_reward/mean": 0.267578125, + "rewards/accuracy_reward/std": 0.4431293308734894, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7082.0, + "completions/max_terminated_length": 7082.0, + "completions/mean_length": 1343.33203125, + "completions/mean_terminated_length": 1343.33203125, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.4084524245755466, + "grad_norm": 0.2226696338224605, + "learning_rate": 1e-06, + "loss": 0.0165, + "num_tokens": 99799835.0, + "reward": 0.7998046875, + "reward_std": 0.2912616729736328, + "rewards/accuracy_reward/mean": 0.302734375, + "rewards/accuracy_reward/std": 0.45989060401916504, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5320.0, + "completions/max_terminated_length": 5320.0, + "completions/mean_length": 1295.900390625, + "completions/mean_terminated_length": 1300.982421875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.4104067423964822, + "grad_norm": 0.2258503859721909, + "learning_rate": 1e-06, + "loss": 0.0312, + "num_tokens": 100540520.0, + "reward": 0.7783203125, + "reward_std": 0.30941659212112427, + "rewards/accuracy_reward/mean": 0.28515625, + "rewards/accuracy_reward/std": 0.45193037390708923, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4738.0, + "completions/max_terminated_length": 4738.0, + "completions/mean_length": 1241.978515625, + "completions/mean_terminated_length": 1246.84912109375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.41236106021741786, + "grad_norm": 0.22559044306458934, + "learning_rate": 1e-06, + "loss": 0.0325, + "num_tokens": 101245405.0, + "reward": 0.96484375, + "reward_std": 0.2747163772583008, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.4995105266571045, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4499.0, + "completions/max_terminated_length": 4499.0, + "completions/mean_length": 1234.599609375, + "completions/mean_terminated_length": 1234.599609375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.4143153780383535, + "grad_norm": 0.25349387253385525, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 101953728.0, + "reward": 0.8359375, + "reward_std": 0.2764972448348999, + "rewards/accuracy_reward/mean": 0.337890625, + "rewards/accuracy_reward/std": 0.4734536409378052, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4785.0, + "completions/max_terminated_length": 4785.0, + "completions/mean_length": 1338.033203125, + "completions/mean_terminated_length": 1340.651611328125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.41626969585928913, + "grad_norm": 0.1976377462802431, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 102706001.0, + "reward": 0.7177734375, + "reward_std": 0.2124912142753601, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41643625497817993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5831.0, + "completions/max_terminated_length": 5831.0, + "completions/mean_length": 1325.44140625, + "completions/mean_terminated_length": 1325.44140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.41822401368022477, + "grad_norm": 0.2361089462857162, + "learning_rate": 1e-06, + "loss": 0.025, + "num_tokens": 103461395.0, + "reward": 0.8115234375, + "reward_std": 0.3174085319042206, + "rewards/accuracy_reward/mean": 0.318359375, + "rewards/accuracy_reward/std": 0.46629536151885986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3854.0, + "completions/max_terminated_length": 3854.0, + "completions/mean_length": 1116.19140625, + "completions/mean_terminated_length": 1116.19140625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.42017833150116035, + "grad_norm": 0.2540272445895197, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 104104005.0, + "reward": 0.9619140625, + "reward_std": 0.3506205081939697, + "rewards/accuracy_reward/mean": 0.462890625, + "rewards/accuracy_reward/std": 0.4991086423397064, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4529.0, + "completions/max_terminated_length": 4529.0, + "completions/mean_length": 1149.2734375, + "completions/mean_terminated_length": 1149.2734375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.422132649322096, + "grad_norm": 0.23547522599544052, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 104773505.0, + "reward": 0.96875, + "reward_std": 0.32343557476997375, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.4995105266571045, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3908.0, + "completions/max_terminated_length": 3908.0, + "completions/mean_length": 1168.609375, + "completions/mean_terminated_length": 1168.609375, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.4240869671430316, + "grad_norm": 0.2694663181990323, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 105451321.0, + "reward": 1.01171875, + "reward_std": 0.33037665486335754, + "rewards/accuracy_reward/mean": 0.51171875, + "rewards/accuracy_reward/std": 0.5003514885902405, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3008.0, + "completions/max_terminated_length": 3008.0, + "completions/mean_length": 1041.546875, + "completions/mean_terminated_length": 1041.546875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.42604128496396726, + "grad_norm": 0.27548566439792915, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 106059969.0, + "reward": 0.9931640625, + "reward_std": 0.32911938428878784, + "rewards/accuracy_reward/mean": 0.494140625, + "rewards/accuracy_reward/std": 0.5004546642303467, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6533.0, + "completions/max_terminated_length": 6533.0, + "completions/mean_length": 1305.521484375, + "completions/mean_terminated_length": 1305.521484375, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.4279956027849029, + "grad_norm": 0.21654847321631024, + "learning_rate": 1e-06, + "loss": 0.037, + "num_tokens": 106789916.0, + "reward": 0.9052734375, + "reward_std": 0.3767821192741394, + "rewards/accuracy_reward/mean": 0.41015625, + "rewards/accuracy_reward/std": 0.49234291911125183, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3495.0, + "completions/max_terminated_length": 3495.0, + "completions/mean_length": 1336.03515625, + "completions/mean_terminated_length": 1336.03515625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.42994992060583853, + "grad_norm": 0.2102321646796608, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 107550798.0, + "reward": 0.82421875, + "reward_std": 0.2967522144317627, + "rewards/accuracy_reward/mean": 0.326171875, + "rewards/accuracy_reward/std": 0.4692695140838623, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7752.0, + "completions/max_terminated_length": 7752.0, + "completions/mean_length": 1345.125, + "completions/mean_terminated_length": 1345.125, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.43190423842677417, + "grad_norm": 0.1817939659335885, + "learning_rate": 1e-06, + "loss": 0.0244, + "num_tokens": 108308318.0, + "reward": 0.8525390625, + "reward_std": 0.2916116714477539, + "rewards/accuracy_reward/mean": 0.357421875, + "rewards/accuracy_reward/std": 0.4797092080116272, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5444.0, + "completions/max_terminated_length": 5444.0, + "completions/mean_length": 1438.9453125, + "completions/mean_terminated_length": 1438.9453125, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "epoch": 0.4338585562477098, + "grad_norm": 0.2012087253582826, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 109124962.0, + "reward": 0.7861328125, + "reward_std": 0.34080904722213745, + "rewards/accuracy_reward/mean": 0.291015625, + "rewards/accuracy_reward/std": 0.45467492938041687, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4203.0, + "completions/max_terminated_length": 4203.0, + "completions/mean_length": 1374.662109375, + "completions/mean_terminated_length": 1374.662109375, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.43581287406864544, + "grad_norm": 0.23799113269890557, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 109906117.0, + "reward": 0.900390625, + "reward_std": 0.42115750908851624, + "rewards/accuracy_reward/mean": 0.40234375, + "rewards/accuracy_reward/std": 0.4908501207828522, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3370.0, + "completions/max_terminated_length": 3370.0, + "completions/mean_length": 1393.76953125, + "completions/mean_terminated_length": 1393.76953125, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.437767191889581, + "grad_norm": 0.22347610625716982, + "learning_rate": 1e-06, + "loss": 0.0239, + "num_tokens": 110696031.0, + "reward": 0.869140625, + "reward_std": 0.3635779917240143, + "rewards/accuracy_reward/mean": 0.37109375, + "rewards/accuracy_reward/std": 0.4835699498653412, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5379.0, + "completions/max_terminated_length": 5379.0, + "completions/mean_length": 1430.97265625, + "completions/mean_terminated_length": 1430.97265625, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "epoch": 0.43972150971051666, + "grad_norm": 0.19406622799312084, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 111506465.0, + "reward": 0.8828125, + "reward_std": 0.3225821554660797, + "rewards/accuracy_reward/mean": 0.384765625, + "rewards/accuracy_reward/std": 0.4870156943798065, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4370.0, + "completions/max_terminated_length": 4370.0, + "completions/mean_length": 1287.361328125, + "completions/mean_terminated_length": 1287.361328125, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.4416758275314523, + "grad_norm": 0.25972222287767005, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 112238442.0, + "reward": 0.890625, + "reward_std": 0.3257838785648346, + "rewards/accuracy_reward/mean": 0.39453125, + "rewards/accuracy_reward/std": 0.4892277717590332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3345.0, + "completions/max_terminated_length": 3345.0, + "completions/mean_length": 1179.46875, + "completions/mean_terminated_length": 1179.46875, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.44363014535238793, + "grad_norm": 0.2560386867692662, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 112905978.0, + "reward": 0.890625, + "reward_std": 0.3570018410682678, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.48836761713027954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5123.0, + "completions/max_terminated_length": 5123.0, + "completions/mean_length": 1219.68359375, + "completions/mean_terminated_length": 1219.68359375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.44558446317332356, + "grad_norm": 0.2356414984480452, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 113590968.0, + "reward": 0.837890625, + "reward_std": 0.2830570936203003, + "rewards/accuracy_reward/mean": 0.337890625, + "rewards/accuracy_reward/std": 0.4734536409378052, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3987.0, + "completions/max_terminated_length": 3987.0, + "completions/mean_length": 1295.80859375, + "completions/mean_terminated_length": 1295.80859375, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.4475387809942592, + "grad_norm": 0.22352450921484546, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 114329398.0, + "reward": 0.8134765625, + "reward_std": 0.27998289465904236, + "rewards/accuracy_reward/mean": 0.314453125, + "rewards/accuracy_reward/std": 0.4647517800331116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4395.0, + "completions/max_terminated_length": 4395.0, + "completions/mean_length": 1367.083984375, + "completions/mean_terminated_length": 1367.083984375, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.44949309881519484, + "grad_norm": 0.2214601942019932, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 115097153.0, + "reward": 0.81640625, + "reward_std": 0.28603070974349976, + "rewards/accuracy_reward/mean": 0.31640625, + "rewards/accuracy_reward/std": 0.46552830934524536, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5252.0, + "completions/max_terminated_length": 5252.0, + "completions/mean_length": 1430.935546875, + "completions/mean_terminated_length": 1430.935546875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.45144741663613047, + "grad_norm": 0.25920374643043276, + "learning_rate": 1e-06, + "loss": 0.0471, + "num_tokens": 115893440.0, + "reward": 0.974609375, + "reward_std": 0.36862912774086, + "rewards/accuracy_reward/mean": 0.478515625, + "rewards/accuracy_reward/std": 0.5000267624855042, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7174.0, + "completions/max_terminated_length": 7174.0, + "completions/mean_length": 1280.830078125, + "completions/mean_terminated_length": 1280.830078125, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.4534017344570661, + "grad_norm": 0.2592376098126605, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 116614953.0, + "reward": 0.921875, + "reward_std": 0.3603953719139099, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49434176087379456, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4907.0, + "completions/max_terminated_length": 4907.0, + "completions/mean_length": 1326.353515625, + "completions/mean_terminated_length": 1326.353515625, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.4553560522780017, + "grad_norm": 0.23780888607870893, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 117360670.0, + "reward": 0.87109375, + "reward_std": 0.27507734298706055, + "rewards/accuracy_reward/mean": 0.37109375, + "rewards/accuracy_reward/std": 0.4835699498653412, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4721.0, + "completions/max_terminated_length": 4721.0, + "completions/mean_length": 1375.630859375, + "completions/mean_terminated_length": 1375.630859375, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.4573103700989373, + "grad_norm": 0.22307879268164593, + "learning_rate": 1e-06, + "loss": 0.0422, + "num_tokens": 118129409.0, + "reward": 0.845703125, + "reward_std": 0.2507952153682709, + "rewards/accuracy_reward/mean": 0.349609375, + "rewards/accuracy_reward/std": 0.47731292247772217, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4826.0, + "completions/max_terminated_length": 4826.0, + "completions/mean_length": 1179.0390625, + "completions/mean_terminated_length": 1179.0390625, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.45926468791987296, + "grad_norm": 0.27374722026205006, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 118799109.0, + "reward": 0.916015625, + "reward_std": 0.32726892828941345, + "rewards/accuracy_reward/mean": 0.41796875, + "rewards/accuracy_reward/std": 0.4937073290348053, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5180.0, + "completions/max_terminated_length": 5180.0, + "completions/mean_length": 1379.52734375, + "completions/mean_terminated_length": 1384.9373779296875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.4612190057408086, + "grad_norm": 0.22628569111337152, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 119570307.0, + "reward": 0.71875, + "reward_std": 0.22908148169517517, + "rewards/accuracy_reward/mean": 0.224609375, + "rewards/accuracy_reward/std": 0.41773295402526855, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5222.0, + "completions/max_terminated_length": 5222.0, + "completions/mean_length": 1257.201171875, + "completions/mean_terminated_length": 1259.6614990234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.46317332356174423, + "grad_norm": 0.24451834127648459, + "learning_rate": 1e-06, + "loss": 0.0384, + "num_tokens": 120283930.0, + "reward": 0.7685546875, + "reward_std": 0.26285383105278015, + "rewards/accuracy_reward/mean": 0.271484375, + "rewards/accuracy_reward/std": 0.44516023993492126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6179.0, + "completions/max_terminated_length": 6179.0, + "completions/mean_length": 1318.66796875, + "completions/mean_terminated_length": 1318.66796875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.46512764138267987, + "grad_norm": 0.24597313505166876, + "learning_rate": 1e-06, + "loss": 0.0159, + "num_tokens": 121026864.0, + "reward": 0.765625, + "reward_std": 0.32728978991508484, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.4461594223976135, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6327.0, + "completions/max_terminated_length": 6327.0, + "completions/mean_length": 1366.318359375, + "completions/mean_terminated_length": 1368.9921875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 437.0, + "epoch": 0.4670819592036155, + "grad_norm": 0.22637809303427706, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 121798723.0, + "reward": 0.814453125, + "reward_std": 0.29507216811180115, + "rewards/accuracy_reward/mean": 0.318359375, + "rewards/accuracy_reward/std": 0.46629536151885986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5243.0, + "completions/max_terminated_length": 5243.0, + "completions/mean_length": 1421.259765625, + "completions/mean_terminated_length": 1424.0411376953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.46903627702455114, + "grad_norm": 0.20235871272365902, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 122595320.0, + "reward": 0.7158203125, + "reward_std": 0.24383221566677094, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41643625497817993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6880.0, + "completions/max_terminated_length": 6880.0, + "completions/mean_length": 1486.3359375, + "completions/mean_terminated_length": 1489.24462890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.4709905948454867, + "grad_norm": 0.18297647778398776, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 123422420.0, + "reward": 0.7001953125, + "reward_std": 0.24867942929267883, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.4027182459831238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 7443.0, + "completions/max_terminated_length": 7443.0, + "completions/mean_length": 1450.201171875, + "completions/mean_terminated_length": 1455.8883056640625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.47294491266642236, + "grad_norm": 0.21710924957726246, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 124227563.0, + "reward": 0.6328125, + "reward_std": 0.21857893466949463, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6167.0, + "completions/max_terminated_length": 6167.0, + "completions/mean_length": 1411.826171875, + "completions/mean_terminated_length": 1414.5889892578125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.474899230487358, + "grad_norm": 0.19157780526603888, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 125007618.0, + "reward": 0.7275390625, + "reward_std": 0.19637173414230347, + "rewards/accuracy_reward/mean": 0.232421875, + "rewards/accuracy_reward/std": 0.42278963327407837, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7651.0, + "completions/max_terminated_length": 7651.0, + "completions/mean_length": 1408.92578125, + "completions/mean_terminated_length": 1408.92578125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.47685354830829363, + "grad_norm": 0.2816542431253473, + "learning_rate": 1e-06, + "loss": 0.0413, + "num_tokens": 125783980.0, + "reward": 0.744140625, + "reward_std": 0.27061402797698975, + "rewards/accuracy_reward/mean": 0.251953125, + "rewards/accuracy_reward/std": 0.43455907702445984, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5019.0, + "completions/max_terminated_length": 5019.0, + "completions/mean_length": 1378.587890625, + "completions/mean_terminated_length": 1378.587890625, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.47880786612922926, + "grad_norm": 0.30246844173470644, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 126558009.0, + "reward": 0.787109375, + "reward_std": 0.287064790725708, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45377036929130554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5088.0, + "completions/max_terminated_length": 5088.0, + "completions/mean_length": 1405.908203125, + "completions/mean_terminated_length": 1408.659423828125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.4807621839501649, + "grad_norm": 0.21886083982415624, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 127358202.0, + "reward": 0.8046875, + "reward_std": 0.2793833613395691, + "rewards/accuracy_reward/mean": 0.306640625, + "rewards/accuracy_reward/std": 0.4615498185157776, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3183.0, + "completions/max_terminated_length": 3183.0, + "completions/mean_length": 1284.67578125, + "completions/mean_terminated_length": 1284.67578125, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.48271650177110054, + "grad_norm": 0.25856410381624856, + "learning_rate": 1e-06, + "loss": 0.023, + "num_tokens": 128092852.0, + "reward": 0.9375, + "reward_std": 0.29149410128593445, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49656352400779724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6179.0, + "completions/max_terminated_length": 6179.0, + "completions/mean_length": 1439.369140625, + "completions/mean_terminated_length": 1442.1859130859375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.4846708195920362, + "grad_norm": 0.22831274195453724, + "learning_rate": 1e-06, + "loss": 0.0409, + "num_tokens": 128901361.0, + "reward": 0.7568359375, + "reward_std": 0.23879684507846832, + "rewards/accuracy_reward/mean": 0.263671875, + "rewards/accuracy_reward/std": 0.4410543739795685, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5848.0, + "completions/max_terminated_length": 5848.0, + "completions/mean_length": 1296.52734375, + "completions/mean_terminated_length": 1296.52734375, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.4866251374129718, + "grad_norm": 0.22329122316767686, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 129630479.0, + "reward": 0.775390625, + "reward_std": 0.2518594264984131, + "rewards/accuracy_reward/mean": 0.275390625, + "rewards/accuracy_reward/std": 0.44714778661727905, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6675.0, + "completions/max_terminated_length": 6675.0, + "completions/mean_length": 1384.455078125, + "completions/mean_terminated_length": 1384.455078125, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.4885794552339074, + "grad_norm": 0.2778058765163096, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 130400376.0, + "reward": 0.7763671875, + "reward_std": 0.26398348808288574, + "rewards/accuracy_reward/mean": 0.279296875, + "rewards/accuracy_reward/std": 0.44909247756004333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5525.0, + "completions/max_terminated_length": 5525.0, + "completions/mean_length": 1225.50390625, + "completions/mean_terminated_length": 1230.3099365234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.490533773054843, + "grad_norm": 0.30235612141076385, + "learning_rate": 1e-06, + "loss": 0.0179, + "num_tokens": 131096106.0, + "reward": 0.9619140625, + "reward_std": 0.2862256169319153, + "rewards/accuracy_reward/mean": 0.462890625, + "rewards/accuracy_reward/std": 0.4991086423397064, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6525.0, + "completions/max_terminated_length": 6525.0, + "completions/mean_length": 1320.34765625, + "completions/mean_terminated_length": 1322.9315185546875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.49248809087577866, + "grad_norm": 0.24511335524371472, + "learning_rate": 1e-06, + "loss": -0.0142, + "num_tokens": 131854364.0, + "reward": 0.7978515625, + "reward_std": 0.21307554841041565, + "rewards/accuracy_reward/mean": 0.298828125, + "rewards/accuracy_reward/std": 0.45819199085235596, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7345.0, + "completions/max_terminated_length": 7345.0, + "completions/mean_length": 1426.18359375, + "completions/mean_terminated_length": 1428.974609375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.4944424086967143, + "grad_norm": 0.27999240403032843, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 132652458.0, + "reward": 0.6630859375, + "reward_std": 0.26357370615005493, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6737.0, + "completions/max_terminated_length": 6737.0, + "completions/mean_length": 1452.083984375, + "completions/mean_terminated_length": 1454.9256591796875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 539.0, + "epoch": 0.49639672651764993, + "grad_norm": 0.24018835743384306, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 133463237.0, + "reward": 0.7197265625, + "reward_std": 0.25255855917930603, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41643625497817993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5223.0, + "completions/max_terminated_length": 5223.0, + "completions/mean_length": 1393.23046875, + "completions/mean_terminated_length": 1395.9569091796875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 485.0, + "epoch": 0.49835104433858557, + "grad_norm": 0.2687682111588074, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 134259579.0, + "reward": 0.7626953125, + "reward_std": 0.32618480920791626, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44209739565849304, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5163.0, + "completions/max_terminated_length": 5163.0, + "completions/mean_length": 1310.626953125, + "completions/mean_terminated_length": 1313.1917724609375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.5003053621595211, + "grad_norm": 0.25356101827433064, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 135003852.0, + "reward": 0.7578125, + "reward_std": 0.28216585516929626, + "rewards/accuracy_reward/mean": 0.259765625, + "rewards/accuracy_reward/std": 0.4389347732067108, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6053.0, + "completions/max_terminated_length": 6053.0, + "completions/mean_length": 1326.267578125, + "completions/mean_terminated_length": 1328.863037109375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 476.0, + "epoch": 0.5022596799804568, + "grad_norm": 0.28274453519554865, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 135757333.0, + "reward": 0.7548828125, + "reward_std": 0.28269314765930176, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43785804510116577, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7298.0, + "completions/max_terminated_length": 7298.0, + "completions/mean_length": 1276.283203125, + "completions/mean_terminated_length": 1276.283203125, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.5042139978013924, + "grad_norm": 0.264190957387748, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 136481494.0, + "reward": 0.908203125, + "reward_std": 0.2794644832611084, + "rewards/accuracy_reward/mean": 0.412109375, + "rewards/accuracy_reward/std": 0.49269601702690125, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5454.0, + "completions/max_terminated_length": 5454.0, + "completions/mean_length": 1283.951171875, + "completions/mean_terminated_length": 1288.986328125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.5061683156223281, + "grad_norm": 0.25917234519575116, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 137212861.0, + "reward": 0.896484375, + "reward_std": 0.2996416389942169, + "rewards/accuracy_reward/mean": 0.400390625, + "rewards/accuracy_reward/std": 0.4904567301273346, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5830.0, + "completions/max_terminated_length": 5830.0, + "completions/mean_length": 1299.919921875, + "completions/mean_terminated_length": 1305.0177001953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.5081226334432637, + "grad_norm": 0.28383153230669234, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 137944340.0, + "reward": 0.86328125, + "reward_std": 0.3364277482032776, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.48250964283943176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4025.0, + "completions/max_terminated_length": 4025.0, + "completions/mean_length": 1345.05859375, + "completions/mean_terminated_length": 1347.6907958984375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 491.0, + "epoch": 0.5100769512641994, + "grad_norm": 0.23334300654895376, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 138699666.0, + "reward": 0.8056640625, + "reward_std": 0.30599480867385864, + "rewards/accuracy_reward/mean": 0.306640625, + "rewards/accuracy_reward/std": 0.4615498185157776, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 6072.0, + "completions/max_terminated_length": 6072.0, + "completions/mean_length": 1238.298828125, + "completions/mean_terminated_length": 1243.155029296875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.512031269085135, + "grad_norm": 0.25104056067996106, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 139394827.0, + "reward": 0.837890625, + "reward_std": 0.2851397395133972, + "rewards/accuracy_reward/mean": 0.341796875, + "rewards/accuracy_reward/std": 0.4747757613658905, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 6896.0, + "completions/max_terminated_length": 6896.0, + "completions/mean_length": 1415.869140625, + "completions/mean_terminated_length": 1427.0177001953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.5139855869060705, + "grad_norm": 0.21419747556393065, + "learning_rate": 1e-06, + "loss": 0.0298, + "num_tokens": 140180472.0, + "reward": 0.7587890625, + "reward_std": 0.24801716208457947, + "rewards/accuracy_reward/mean": 0.267578125, + "rewards/accuracy_reward/std": 0.4431293308734894, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.982421875, + "rewards/soft_format_reward/std": 0.13154059648513794, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5876.0, + "completions/max_terminated_length": 5876.0, + "completions/mean_length": 1273.380859375, + "completions/mean_terminated_length": 1273.380859375, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.5159399047270062, + "grad_norm": 0.23148535928568678, + "learning_rate": 1e-06, + "loss": 0.0174, + "num_tokens": 140892891.0, + "reward": 0.703125, + "reward_std": 0.23366022109985352, + "rewards/accuracy_reward/mean": 0.205078125, + "rewards/accuracy_reward/std": 0.4041535556316376, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4633.0, + "completions/max_terminated_length": 4633.0, + "completions/mean_length": 1164.72265625, + "completions/mean_terminated_length": 1164.72265625, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.5178942225479418, + "grad_norm": 0.21158873936329442, + "learning_rate": 1e-06, + "loss": 0.009, + "num_tokens": 141554605.0, + "reward": 0.7392578125, + "reward_std": 0.215408056974411, + "rewards/accuracy_reward/mean": 0.240234375, + "rewards/accuracy_reward/std": 0.4276435375213623, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7732.0, + "completions/max_terminated_length": 7732.0, + "completions/mean_length": 1325.74609375, + "completions/mean_terminated_length": 1328.3404541015625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.5198485403688775, + "grad_norm": 0.3461808786839474, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 142315499.0, + "reward": 0.7158203125, + "reward_std": 0.22852852940559387, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41643625497817993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5986.0, + "completions/max_terminated_length": 5986.0, + "completions/mean_length": 1203.49609375, + "completions/mean_terminated_length": 1205.851318359375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.5218028581898131, + "grad_norm": 0.1892290875996238, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 142994329.0, + "reward": 0.6953125, + "reward_std": 0.16969367861747742, + "rewards/accuracy_reward/mean": 0.197265625, + "rewards/accuracy_reward/std": 0.3983237147331238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 5350.0, + "completions/max_terminated_length": 5350.0, + "completions/mean_length": 1321.11328125, + "completions/mean_terminated_length": 1331.5157470703125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.5237571760107488, + "grad_norm": 0.1952732332122388, + "learning_rate": 1e-06, + "loss": 0.0191, + "num_tokens": 143737539.0, + "reward": 0.66015625, + "reward_std": 0.20711097121238708, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6096.0, + "completions/max_terminated_length": 6096.0, + "completions/mean_length": 1102.919921875, + "completions/mean_terminated_length": 1102.919921875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.5257114938316844, + "grad_norm": 0.21497685469215014, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 144367754.0, + "reward": 0.669921875, + "reward_std": 0.17656955122947693, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6182.0, + "completions/max_terminated_length": 6182.0, + "completions/mean_length": 1240.73046875, + "completions/mean_terminated_length": 1243.158447265625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.52766581165262, + "grad_norm": 0.25688160826462914, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 145070656.0, + "reward": 0.6376953125, + "reward_std": 0.22297914326190948, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5773.0, + "completions/max_terminated_length": 5773.0, + "completions/mean_length": 1338.279296875, + "completions/mean_terminated_length": 1340.898193359375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 546.0, + "epoch": 0.5296201294735556, + "grad_norm": 0.1953500641217408, + "learning_rate": 1e-06, + "loss": 0.0259, + "num_tokens": 145841727.0, + "reward": 0.8134765625, + "reward_std": 0.20196539163589478, + "rewards/accuracy_reward/mean": 0.31640625, + "rewards/accuracy_reward/std": 0.46552830934524536, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5437.0, + "completions/max_terminated_length": 5437.0, + "completions/mean_length": 1325.4140625, + "completions/mean_terminated_length": 1330.61181640625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.5315744472944912, + "grad_norm": 0.19925701187467806, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 146611955.0, + "reward": 0.6962890625, + "reward_std": 0.2049601823091507, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.39980348944664, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7582.0, + "completions/max_terminated_length": 7582.0, + "completions/mean_length": 1306.53515625, + "completions/mean_terminated_length": 1306.53515625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.5335287651154269, + "grad_norm": 0.24343556855221313, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 147357205.0, + "reward": 0.751953125, + "reward_std": 0.2099478542804718, + "rewards/accuracy_reward/mean": 0.25390625, + "rewards/accuracy_reward/std": 0.43567025661468506, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 8147.0, + "completions/max_terminated_length": 8147.0, + "completions/mean_length": 1408.68359375, + "completions/mean_terminated_length": 1411.4403076171875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.5354830829363625, + "grad_norm": 0.19996323742397923, + "learning_rate": 1e-06, + "loss": 0.0282, + "num_tokens": 148145987.0, + "reward": 0.712890625, + "reward_std": 0.214003324508667, + "rewards/accuracy_reward/mean": 0.216796875, + "rewards/accuracy_reward/std": 0.4124660789966583, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6031.0, + "completions/max_terminated_length": 6031.0, + "completions/mean_length": 1197.4296875, + "completions/mean_terminated_length": 1199.77294921875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.5374374007572982, + "grad_norm": 0.27909572562391854, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 148837039.0, + "reward": 0.8544921875, + "reward_std": 0.35284459590911865, + "rewards/accuracy_reward/mean": 0.357421875, + "rewards/accuracy_reward/std": 0.4797092080116272, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5266.0, + "completions/max_terminated_length": 5266.0, + "completions/mean_length": 1256.677734375, + "completions/mean_terminated_length": 1259.136962890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.5393917185782338, + "grad_norm": 0.280750755313218, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 149558746.0, + "reward": 0.8486328125, + "reward_std": 0.27322277426719666, + "rewards/accuracy_reward/mean": 0.349609375, + "rewards/accuracy_reward/std": 0.47731292247772217, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5315.0, + "completions/max_terminated_length": 5315.0, + "completions/mean_length": 1281.140625, + "completions/mean_terminated_length": 1283.647705078125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.5413460363991695, + "grad_norm": 0.28173355658269544, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 150280162.0, + "reward": 0.748046875, + "reward_std": 0.25995174050331116, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.43343618512153625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6195.0, + "completions/max_terminated_length": 6195.0, + "completions/mean_length": 1367.12890625, + "completions/mean_terminated_length": 1369.8043212890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.543300354220105, + "grad_norm": 0.22178379988373031, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 151056052.0, + "reward": 0.7197265625, + "reward_std": 0.2542484998703003, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41643625497817993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5252.0, + "completions/max_terminated_length": 5252.0, + "completions/mean_length": 1245.962890625, + "completions/mean_terminated_length": 1245.962890625, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.5452546720410407, + "grad_norm": 0.22481190442387144, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 151770369.0, + "reward": 0.73828125, + "reward_std": 0.2011508047580719, + "rewards/accuracy_reward/mean": 0.23828125, + "rewards/accuracy_reward/std": 0.42644867300987244, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8155.0, + "completions/max_terminated_length": 8155.0, + "completions/mean_length": 1266.732421875, + "completions/mean_terminated_length": 1266.732421875, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.5472089898619763, + "grad_norm": 0.2727355099335686, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 152500520.0, + "reward": 0.7646484375, + "reward_std": 0.2716621160507202, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44209739565849304, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4863.0, + "completions/max_terminated_length": 4863.0, + "completions/mean_length": 1249.373046875, + "completions/mean_terminated_length": 1251.8179931640625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.5491633076829119, + "grad_norm": 0.254467816357727, + "learning_rate": 1e-06, + "loss": 0.0214, + "num_tokens": 153217751.0, + "reward": 0.80078125, + "reward_std": 0.2420767992734909, + "rewards/accuracy_reward/mean": 0.302734375, + "rewards/accuracy_reward/std": 0.45989060401916504, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5845.0, + "completions/max_terminated_length": 5845.0, + "completions/mean_length": 1241.142578125, + "completions/mean_terminated_length": 1241.142578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.5511176255038476, + "grad_norm": 0.26731503717496, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 153928944.0, + "reward": 0.744140625, + "reward_std": 0.31095176935195923, + "rewards/accuracy_reward/mean": 0.244140625, + "rewards/accuracy_reward/std": 0.42999663949012756, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4502.0, + "completions/max_terminated_length": 4502.0, + "completions/mean_length": 1141.52734375, + "completions/mean_terminated_length": 1141.52734375, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.5530719433247832, + "grad_norm": 0.29543714797132775, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 154582718.0, + "reward": 0.8837890625, + "reward_std": 0.29443037509918213, + "rewards/accuracy_reward/mean": 0.38671875, + "rewards/accuracy_reward/std": 0.48747459053993225, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5123.0, + "completions/max_terminated_length": 5123.0, + "completions/mean_length": 1136.943359375, + "completions/mean_terminated_length": 1139.1683349609375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5550262611457188, + "grad_norm": 0.2828952806622914, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 155226529.0, + "reward": 0.8955078125, + "reward_std": 0.37791115045547485, + "rewards/accuracy_reward/mean": 0.396484375, + "rewards/accuracy_reward/std": 0.4896455705165863, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5720.0, + "completions/max_terminated_length": 5720.0, + "completions/mean_length": 1207.5, + "completions/mean_terminated_length": 1207.5, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.5569805789666544, + "grad_norm": 0.19922256568936855, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 155907697.0, + "reward": 0.818359375, + "reward_std": 0.20037144422531128, + "rewards/accuracy_reward/mean": 0.318359375, + "rewards/accuracy_reward/std": 0.46629536151885986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5457.0, + "completions/max_terminated_length": 5457.0, + "completions/mean_length": 1202.625, + "completions/mean_terminated_length": 1202.625, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.5589348967875901, + "grad_norm": 0.20659242696877841, + "learning_rate": 1e-06, + "loss": -0.0089, + "num_tokens": 156589761.0, + "reward": 0.767578125, + "reward_std": 0.21927177906036377, + "rewards/accuracy_reward/mean": 0.267578125, + "rewards/accuracy_reward/std": 0.4431293308734894, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5125.0, + "completions/max_terminated_length": 5125.0, + "completions/mean_length": 1209.4453125, + "completions/mean_terminated_length": 1209.4453125, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.5608892146085257, + "grad_norm": 0.21727037733888616, + "learning_rate": 1e-06, + "loss": 0.0273, + "num_tokens": 157281029.0, + "reward": 0.7041015625, + "reward_std": 0.2016957402229309, + "rewards/accuracy_reward/mean": 0.205078125, + "rewards/accuracy_reward/std": 0.4041535556316376, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6177.0, + "completions/max_terminated_length": 6177.0, + "completions/mean_length": 1437.296875, + "completions/mean_terminated_length": 1437.296875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.5628435324294614, + "grad_norm": 0.18727838735758034, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 158075677.0, + "reward": 0.638671875, + "reward_std": 0.23068195581436157, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3439.0, + "completions/max_terminated_length": 3439.0, + "completions/mean_length": 1318.36328125, + "completions/mean_terminated_length": 1318.36328125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "epoch": 0.564797850250397, + "grad_norm": 0.19785696739892583, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 158815367.0, + "reward": 0.7421875, + "reward_std": 0.22653764486312866, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.42882615327835083, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5582.0, + "completions/max_terminated_length": 5582.0, + "completions/mean_length": 1262.37109375, + "completions/mean_terminated_length": 1262.37109375, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "epoch": 0.5667521680713326, + "grad_norm": 0.2540364603911545, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 159528357.0, + "reward": 0.7841796875, + "reward_std": 0.2865217924118042, + "rewards/accuracy_reward/mean": 0.28515625, + "rewards/accuracy_reward/std": 0.45193037390708923, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4644.0, + "completions/max_terminated_length": 4644.0, + "completions/mean_length": 1258.876953125, + "completions/mean_terminated_length": 1263.8138427734375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.5687064858922682, + "grad_norm": 0.19737307518232416, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 160255286.0, + "reward": 0.7158203125, + "reward_std": 0.2256154865026474, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41380295157432556, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7409.0, + "completions/max_terminated_length": 7409.0, + "completions/mean_length": 1281.404296875, + "completions/mean_terminated_length": 1281.404296875, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.5706608037132038, + "grad_norm": 0.18615225280711617, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 160986821.0, + "reward": 0.6953125, + "reward_std": 0.19349795579910278, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3968288004398346, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3346.0, + "completions/max_terminated_length": 3346.0, + "completions/mean_length": 1142.7578125, + "completions/mean_terminated_length": 1144.994140625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.5726151215341395, + "grad_norm": 0.24115677670650662, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 161646329.0, + "reward": 0.8515625, + "reward_std": 0.2602195143699646, + "rewards/accuracy_reward/mean": 0.353515625, + "rewards/accuracy_reward/std": 0.47852855920791626, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4570.0, + "completions/max_terminated_length": 4570.0, + "completions/mean_length": 1345.5546875, + "completions/mean_terminated_length": 1345.5546875, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.5745694393550751, + "grad_norm": 0.18825432660305658, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 162397765.0, + "reward": 0.63671875, + "reward_std": 0.19244548678398132, + "rewards/accuracy_reward/mean": 0.13671875, + "rewards/accuracy_reward/std": 0.3438861668109894, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6217.0, + "completions/max_terminated_length": 6217.0, + "completions/mean_length": 1246.3046875, + "completions/mean_terminated_length": 1246.3046875, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.5765237571760108, + "grad_norm": 0.24728104372391582, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 163112369.0, + "reward": 0.7451171875, + "reward_std": 0.26990675926208496, + "rewards/accuracy_reward/mean": 0.24609375, + "rewards/accuracy_reward/std": 0.4311550557613373, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4546.0, + "completions/max_terminated_length": 4546.0, + "completions/mean_length": 1203.359375, + "completions/mean_terminated_length": 1203.359375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.5784780749969464, + "grad_norm": 0.29193703171516394, + "learning_rate": 1e-06, + "loss": 0.0148, + "num_tokens": 163804297.0, + "reward": 0.8984375, + "reward_std": 0.3659275770187378, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4900552034378052, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5390.0, + "completions/max_terminated_length": 5390.0, + "completions/mean_length": 1366.40625, + "completions/mean_terminated_length": 1369.0802001953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.5804323928178821, + "grad_norm": 0.1842092204397344, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 164572713.0, + "reward": 0.7099609375, + "reward_std": 0.23039336502552032, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4083731174468994, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7714.0, + "completions/max_terminated_length": 7714.0, + "completions/mean_length": 1304.634765625, + "completions/mean_terminated_length": 1304.634765625, + "completions/min_length": 600.0, + "completions/min_terminated_length": 600.0, + "epoch": 0.5823867106388176, + "grad_norm": 0.2211151675031987, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 165308222.0, + "reward": 0.732421875, + "reward_std": 0.21952584385871887, + "rewards/accuracy_reward/mean": 0.236328125, + "rewards/accuracy_reward/std": 0.42524150013923645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6066.0, + "completions/max_terminated_length": 6066.0, + "completions/mean_length": 1321.25, + "completions/mean_terminated_length": 1321.25, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.5843410284597532, + "grad_norm": 0.18867251709709362, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 166055998.0, + "reward": 0.787109375, + "reward_std": 0.2185419797897339, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45377036929130554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6931.0, + "completions/max_terminated_length": 6931.0, + "completions/mean_length": 1446.7734375, + "completions/mean_terminated_length": 1446.7734375, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "epoch": 0.5862953462806889, + "grad_norm": 0.1746101844825762, + "learning_rate": 1e-06, + "loss": 0.0174, + "num_tokens": 166874138.0, + "reward": 0.69921875, + "reward_std": 0.1896924078464508, + "rewards/accuracy_reward/mean": 0.201171875, + "rewards/accuracy_reward/std": 0.4012683033943176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5921.0, + "completions/max_terminated_length": 5921.0, + "completions/mean_length": 1090.76171875, + "completions/mean_terminated_length": 1090.76171875, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.5882496641016245, + "grad_norm": 0.2652683767846162, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 167500864.0, + "reward": 0.8134765625, + "reward_std": 0.2617543935775757, + "rewards/accuracy_reward/mean": 0.314453125, + "rewards/accuracy_reward/std": 0.4647517800331116, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4860.0, + "completions/max_terminated_length": 4860.0, + "completions/mean_length": 1186.728515625, + "completions/mean_terminated_length": 1186.728515625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.5902039819225602, + "grad_norm": 0.28742720239369074, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 168176373.0, + "reward": 0.7392578125, + "reward_std": 0.2963982820510864, + "rewards/accuracy_reward/mean": 0.240234375, + "rewards/accuracy_reward/std": 0.4276435375213623, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2625.0, + "completions/max_terminated_length": 2625.0, + "completions/mean_length": 1135.060546875, + "completions/mean_terminated_length": 1135.060546875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.5921582997434958, + "grad_norm": 0.24364090014388193, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 168819428.0, + "reward": 0.7861328125, + "reward_std": 0.2738185524940491, + "rewards/accuracy_reward/mean": 0.287109375, + "rewards/accuracy_reward/std": 0.45285552740097046, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6280.0, + "completions/max_terminated_length": 6280.0, + "completions/mean_length": 1118.4453125, + "completions/mean_terminated_length": 1118.4453125, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.5941126175644315, + "grad_norm": 0.2443718889592878, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 169457320.0, + "reward": 0.80859375, + "reward_std": 0.24863120913505554, + "rewards/accuracy_reward/mean": 0.30859375, + "rewards/accuracy_reward/std": 0.4623647928237915, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5359.0, + "completions/max_terminated_length": 5359.0, + "completions/mean_length": 1251.5234375, + "completions/mean_terminated_length": 1251.5234375, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.596066935385367, + "grad_norm": 0.20779377623108775, + "learning_rate": 1e-06, + "loss": 0.0141, + "num_tokens": 170168052.0, + "reward": 0.724609375, + "reward_std": 0.2509995698928833, + "rewards/accuracy_reward/mean": 0.224609375, + "rewards/accuracy_reward/std": 0.41773295402526855, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6104.0, + "completions/max_terminated_length": 6104.0, + "completions/mean_length": 1283.875, + "completions/mean_terminated_length": 1286.387451171875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.5980212532063027, + "grad_norm": 0.1766449162986385, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 170887732.0, + "reward": 0.7255859375, + "reward_std": 0.19073061645030975, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4190165400505066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7122.0, + "completions/max_terminated_length": 7122.0, + "completions/mean_length": 1146.49609375, + "completions/mean_terminated_length": 1146.49609375, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.5999755710272383, + "grad_norm": 0.2225710480650791, + "learning_rate": 1e-06, + "loss": 0.0299, + "num_tokens": 171546370.0, + "reward": 0.7421875, + "reward_std": 0.186070054769516, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.42882615327835083, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5522.0, + "completions/max_terminated_length": 5522.0, + "completions/mean_length": 1208.8046875, + "completions/mean_terminated_length": 1208.8046875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.6019298888481739, + "grad_norm": 0.22296969655975601, + "learning_rate": 1e-06, + "loss": 0.0148, + "num_tokens": 172231054.0, + "reward": 0.7275390625, + "reward_std": 0.17332613468170166, + "rewards/accuracy_reward/mean": 0.228515625, + "rewards/accuracy_reward/std": 0.4202871024608612, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3854.0, + "completions/max_terminated_length": 3854.0, + "completions/mean_length": 1209.716796875, + "completions/mean_terminated_length": 1212.0841064453125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.6038842066691096, + "grad_norm": 0.1606736481114355, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 172915501.0, + "reward": 0.5869140625, + "reward_std": 0.1463484913110733, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 6949.0, + "completions/max_terminated_length": 6949.0, + "completions/mean_length": 1279.490234375, + "completions/mean_terminated_length": 1284.5079345703125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.6058385244900452, + "grad_norm": 0.18416248262580898, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 173640968.0, + "reward": 0.671875, + "reward_std": 0.22062185406684875, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3584.0, + "completions/max_terminated_length": 3584.0, + "completions/mean_length": 1162.248046875, + "completions/mean_terminated_length": 1164.5224609375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.6077928423109809, + "grad_norm": 0.23246408704391325, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 174318775.0, + "reward": 0.77734375, + "reward_std": 0.2426483929157257, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.45004892349243164, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4995.0, + "completions/max_terminated_length": 4995.0, + "completions/mean_length": 1240.5625, + "completions/mean_terminated_length": 1240.5625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.6097471601319164, + "grad_norm": 0.1832784916826205, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 175027335.0, + "reward": 0.7060546875, + "reward_std": 0.1606566309928894, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40557438135147095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4773.0, + "completions/max_terminated_length": 4773.0, + "completions/mean_length": 1180.78125, + "completions/mean_terminated_length": 1183.0919189453125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.6117014779528521, + "grad_norm": 0.230562214708346, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 175710535.0, + "reward": 0.6845703125, + "reward_std": 0.2345835268497467, + "rewards/accuracy_reward/mean": 0.185546875, + "rewards/accuracy_reward/std": 0.38912075757980347, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4838.0, + "completions/max_terminated_length": 4838.0, + "completions/mean_length": 1161.8515625, + "completions/mean_terminated_length": 1161.8515625, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.6136557957737877, + "grad_norm": 0.22170886094177467, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 176394603.0, + "reward": 0.603515625, + "reward_std": 0.17395907640457153, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5340.0, + "completions/max_terminated_length": 5340.0, + "completions/mean_length": 1320.169921875, + "completions/mean_terminated_length": 1320.169921875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.6156101135947234, + "grad_norm": 0.20057189309893456, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 177151666.0, + "reward": 0.720703125, + "reward_std": 0.19301357865333557, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41643625497817993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 2809.0, + "completions/max_terminated_length": 2809.0, + "completions/mean_length": 1061.544921875, + "completions/mean_terminated_length": 1063.622314453125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.617564431415659, + "grad_norm": 0.2723445344761178, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 177763177.0, + "reward": 0.7880859375, + "reward_std": 0.27397823333740234, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45377036929130554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4760.0, + "completions/max_terminated_length": 4760.0, + "completions/mean_length": 1218.640625, + "completions/mean_terminated_length": 1218.640625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.6195187492365946, + "grad_norm": 0.18720550185917526, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 178457745.0, + "reward": 0.6376953125, + "reward_std": 0.15164783596992493, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5357.0, + "completions/max_terminated_length": 5357.0, + "completions/mean_length": 1190.13671875, + "completions/mean_terminated_length": 1190.13671875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.6214730670575302, + "grad_norm": 0.2530018249379818, + "learning_rate": 1e-06, + "loss": -0.016, + "num_tokens": 179141959.0, + "reward": 0.669921875, + "reward_std": 0.22028885781764984, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6749.0, + "completions/max_terminated_length": 6749.0, + "completions/mean_length": 1356.705078125, + "completions/mean_terminated_length": 1356.705078125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "epoch": 0.6234273848784658, + "grad_norm": 0.19804784312142373, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 179911936.0, + "reward": 0.7353515625, + "reward_std": 0.23102496564388275, + "rewards/accuracy_reward/mean": 0.236328125, + "rewards/accuracy_reward/std": 0.42524150013923645, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4485.0, + "completions/max_terminated_length": 4485.0, + "completions/mean_length": 1196.1484375, + "completions/mean_terminated_length": 1196.1484375, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.6253817026994015, + "grad_norm": 0.26914491606466645, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 180586540.0, + "reward": 0.7666015625, + "reward_std": 0.26667365431785583, + "rewards/accuracy_reward/mean": 0.267578125, + "rewards/accuracy_reward/std": 0.4431293308734894, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3929.0, + "completions/max_terminated_length": 3929.0, + "completions/mean_length": 1142.578125, + "completions/mean_terminated_length": 1142.578125, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.6273360205203371, + "grad_norm": 0.24388057261871532, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 181244932.0, + "reward": 0.83203125, + "reward_std": 0.2915608584880829, + "rewards/accuracy_reward/mean": 0.33203125, + "rewards/accuracy_reward/std": 0.47140273451805115, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3712.0, + "completions/max_terminated_length": 3712.0, + "completions/mean_length": 1058.6796875, + "completions/mean_terminated_length": 1058.6796875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.6292903383412728, + "grad_norm": 0.2737365442286321, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 181857536.0, + "reward": 0.8427734375, + "reward_std": 0.26035070419311523, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4754233956336975, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 5252.0, + "completions/max_terminated_length": 5252.0, + "completions/mean_length": 1055.70703125, + "completions/mean_terminated_length": 1061.9293212890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.6312446561622084, + "grad_norm": 0.2688708501433409, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 182460202.0, + "reward": 0.7900390625, + "reward_std": 0.25557857751846313, + "rewards/accuracy_reward/mean": 0.29296875, + "rewards/accuracy_reward/std": 0.455569326877594, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6176.0, + "completions/max_terminated_length": 6176.0, + "completions/mean_length": 1128.767578125, + "completions/mean_terminated_length": 1130.9765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.633198973983144, + "grad_norm": 0.24340139499492847, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 183102387.0, + "reward": 0.6708984375, + "reward_std": 0.22567912936210632, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5614.0, + "completions/max_terminated_length": 5614.0, + "completions/mean_length": 1137.291015625, + "completions/mean_terminated_length": 1137.291015625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.6351532918040796, + "grad_norm": 0.20894579914889078, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 183771304.0, + "reward": 0.619140625, + "reward_std": 0.15983879566192627, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4778.0, + "completions/max_terminated_length": 4778.0, + "completions/mean_length": 1087.748046875, + "completions/mean_terminated_length": 1089.876708984375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.6371076096250152, + "grad_norm": 0.20380511545034277, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 184395415.0, + "reward": 0.63671875, + "reward_std": 0.17275160551071167, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3908.0, + "completions/max_terminated_length": 3908.0, + "completions/mean_length": 1121.318359375, + "completions/mean_terminated_length": 1121.318359375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.6390619274459509, + "grad_norm": 0.19270109166520963, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 185042762.0, + "reward": 0.7109375, + "reward_std": 0.12538030743598938, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4083731174468994, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7179.0, + "completions/max_terminated_length": 7179.0, + "completions/mean_length": 1247.5234375, + "completions/mean_terminated_length": 1249.9647216796875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.6410162452668865, + "grad_norm": 0.21155381539413848, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 185748710.0, + "reward": 0.6875, + "reward_std": 0.2159307301044464, + "rewards/accuracy_reward/mean": 0.189453125, + "rewards/accuracy_reward/std": 0.3922513723373413, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 7549.0, + "completions/max_terminated_length": 7549.0, + "completions/mean_length": 1215.3046875, + "completions/mean_terminated_length": 1220.0706787109375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.6429705630878222, + "grad_norm": 0.19949286765163257, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 186439954.0, + "reward": 0.66015625, + "reward_std": 0.15878796577453613, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5604.0, + "completions/max_terminated_length": 5604.0, + "completions/mean_length": 1035.896484375, + "completions/mean_terminated_length": 1035.896484375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.6449248809087578, + "grad_norm": 0.27600909821573316, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 187038893.0, + "reward": 0.666015625, + "reward_std": 0.20721355080604553, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4074.0, + "completions/max_terminated_length": 4074.0, + "completions/mean_length": 1068.13671875, + "completions/mean_terminated_length": 1072.3255615234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.6468791987296935, + "grad_norm": 0.30988286761332945, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 187662227.0, + "reward": 0.697265625, + "reward_std": 0.2212134301662445, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.39980348944664, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5017.0, + "completions/max_terminated_length": 5017.0, + "completions/mean_length": 1221.240234375, + "completions/mean_terminated_length": 1221.240234375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.648833516550629, + "grad_norm": 0.24317938739142395, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 188368990.0, + "reward": 0.65625, + "reward_std": 0.21825319528579712, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4428.0, + "completions/max_terminated_length": 4428.0, + "completions/mean_length": 1090.25, + "completions/mean_terminated_length": 1090.25, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.6507878343715646, + "grad_norm": 0.31929286473859864, + "learning_rate": 1e-06, + "loss": -0.0155, + "num_tokens": 189001134.0, + "reward": 0.76953125, + "reward_std": 0.2776387333869934, + "rewards/accuracy_reward/mean": 0.26953125, + "rewards/accuracy_reward/std": 0.44415023922920227, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6194.0, + "completions/max_terminated_length": 6194.0, + "completions/mean_length": 1159.8828125, + "completions/mean_terminated_length": 1162.152587890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 454.0, + "epoch": 0.6527421521925003, + "grad_norm": 0.2679230346164363, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 189672162.0, + "reward": 0.6474609375, + "reward_std": 0.16923800110816956, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3883.0, + "completions/max_terminated_length": 3883.0, + "completions/mean_length": 1101.814453125, + "completions/mean_terminated_length": 1101.814453125, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.6546964700134359, + "grad_norm": 0.33651843292602207, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 190302067.0, + "reward": 0.7626953125, + "reward_std": 0.25743982195854187, + "rewards/accuracy_reward/mean": 0.263671875, + "rewards/accuracy_reward/std": 0.4410543739795685, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 6098.0, + "completions/max_terminated_length": 6098.0, + "completions/mean_length": 1060.287109375, + "completions/mean_terminated_length": 1070.74365234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.6566507878343716, + "grad_norm": 0.3302355836036535, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 190925526.0, + "reward": 0.80078125, + "reward_std": 0.30171141028404236, + "rewards/accuracy_reward/mean": 0.306640625, + "rewards/accuracy_reward/std": 0.4615498185157776, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5958.0, + "completions/max_terminated_length": 5958.0, + "completions/mean_length": 1190.58203125, + "completions/mean_terminated_length": 1195.2510986328125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.6586051056553072, + "grad_norm": 0.21996861286935993, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 191592304.0, + "reward": 0.6640625, + "reward_std": 0.1925305426120758, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7098.0, + "completions/max_terminated_length": 7098.0, + "completions/mean_length": 1240.359375, + "completions/mean_terminated_length": 1240.359375, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.6605594234762429, + "grad_norm": 0.27097441985004145, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 192304840.0, + "reward": 0.763671875, + "reward_std": 0.2036128044128418, + "rewards/accuracy_reward/mean": 0.263671875, + "rewards/accuracy_reward/std": 0.4410543739795685, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5820.0, + "completions/max_terminated_length": 5820.0, + "completions/mean_length": 1176.544921875, + "completions/mean_terminated_length": 1178.8472900390625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.6625137412971784, + "grad_norm": 0.2630711912093372, + "learning_rate": 1e-06, + "loss": -0.0133, + "num_tokens": 192980351.0, + "reward": 0.6484375, + "reward_std": 0.17246973514556885, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3905.0, + "completions/max_terminated_length": 3905.0, + "completions/mean_length": 1065.25390625, + "completions/mean_terminated_length": 1065.25390625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.6644680591181141, + "grad_norm": 0.31506049487139687, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 193596337.0, + "reward": 0.7744140625, + "reward_std": 0.27411314845085144, + "rewards/accuracy_reward/mean": 0.275390625, + "rewards/accuracy_reward/std": 0.44714778661727905, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7397.0, + "completions/max_terminated_length": 7397.0, + "completions/mean_length": 1086.123046875, + "completions/mean_terminated_length": 1088.24853515625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.6664223769390497, + "grad_norm": 0.316903907975029, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 194223424.0, + "reward": 0.6279296875, + "reward_std": 0.1863647848367691, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7682.0, + "completions/max_terminated_length": 7682.0, + "completions/mean_length": 1159.943359375, + "completions/mean_terminated_length": 1162.2132568359375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.6683766947599853, + "grad_norm": 0.3091863491149638, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 194892483.0, + "reward": 0.740234375, + "reward_std": 0.2670055627822876, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.42882615327835083, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6399.0, + "completions/max_terminated_length": 6399.0, + "completions/mean_length": 1154.76953125, + "completions/mean_terminated_length": 1154.76953125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.670331012580921, + "grad_norm": 0.3157426047270457, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 195559085.0, + "reward": 0.798828125, + "reward_std": 0.2699702978134155, + "rewards/accuracy_reward/mean": 0.30078125, + "rewards/accuracy_reward/std": 0.45904624462127686, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4826.0, + "completions/max_terminated_length": 4826.0, + "completions/mean_length": 1113.318359375, + "completions/mean_terminated_length": 1117.684326171875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.6722853304018566, + "grad_norm": 0.36358335421870613, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 196193744.0, + "reward": 0.763671875, + "reward_std": 0.2875080108642578, + "rewards/accuracy_reward/mean": 0.267578125, + "rewards/accuracy_reward/std": 0.4431293308734894, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4507.0, + "completions/max_terminated_length": 4507.0, + "completions/mean_length": 1216.673828125, + "completions/mean_terminated_length": 1221.4451904296875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.6742396482227923, + "grad_norm": 0.3001262432277463, + "learning_rate": 1e-06, + "loss": -0.0137, + "num_tokens": 196877385.0, + "reward": 0.7177734375, + "reward_std": 0.2189716249704361, + "rewards/accuracy_reward/mean": 0.220703125, + "rewards/accuracy_reward/std": 0.4151262938976288, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7004.0, + "completions/max_terminated_length": 7004.0, + "completions/mean_length": 1211.314453125, + "completions/mean_terminated_length": 1211.314453125, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.6761939660437278, + "grad_norm": 0.2302453296097891, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 197560122.0, + "reward": 0.6494140625, + "reward_std": 0.19137297570705414, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7214.0, + "completions/max_terminated_length": 7214.0, + "completions/mean_length": 1172.205078125, + "completions/mean_terminated_length": 1172.205078125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.6781482838646635, + "grad_norm": 0.28634503877155254, + "learning_rate": 1e-06, + "loss": 0.009, + "num_tokens": 198232851.0, + "reward": 0.677734375, + "reward_std": 0.1759292334318161, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4025.0, + "completions/max_terminated_length": 4025.0, + "completions/mean_length": 1132.541015625, + "completions/mean_terminated_length": 1134.75732421875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.6801026016855991, + "grad_norm": 0.208641834615973, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 198881016.0, + "reward": 0.6220703125, + "reward_std": 0.13517014682292938, + "rewards/accuracy_reward/mean": 0.123046875, + "rewards/accuracy_reward/std": 0.32881227135658264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 7757.0, + "completions/max_terminated_length": 7757.0, + "completions/mean_length": 1266.953125, + "completions/mean_terminated_length": 1279.44775390625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.6820569195065348, + "grad_norm": 0.22912392510267646, + "learning_rate": 1e-06, + "loss": -0.0217, + "num_tokens": 199599408.0, + "reward": 0.62109375, + "reward_std": 0.18424922227859497, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5232.0, + "completions/max_terminated_length": 5232.0, + "completions/mean_length": 1082.787109375, + "completions/mean_terminated_length": 1084.906005859375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.6840112373274704, + "grad_norm": 0.2974567336637007, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 200221987.0, + "reward": 0.6533203125, + "reward_std": 0.2106063961982727, + "rewards/accuracy_reward/mean": 0.154296875, + "rewards/accuracy_reward/std": 0.36158639192581177, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7924.0, + "completions/max_terminated_length": 7924.0, + "completions/mean_length": 1190.140625, + "completions/mean_terminated_length": 1192.4696044921875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.685965555148406, + "grad_norm": 0.2890625223785535, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 200891227.0, + "reward": 0.705078125, + "reward_std": 0.2059016078710556, + "rewards/accuracy_reward/mean": 0.208984375, + "rewards/accuracy_reward/std": 0.40698084235191345, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5089.0, + "completions/max_terminated_length": 5089.0, + "completions/mean_length": 1138.900390625, + "completions/mean_terminated_length": 1138.900390625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.6879198729693417, + "grad_norm": 0.2709210267940066, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 201537400.0, + "reward": 0.6484375, + "reward_std": 0.1775255799293518, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5862.0, + "completions/max_terminated_length": 5862.0, + "completions/mean_length": 1111.69921875, + "completions/mean_terminated_length": 1116.058837890625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.6898741907902772, + "grad_norm": 0.3057721600626253, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 202178526.0, + "reward": 0.71484375, + "reward_std": 0.23337948322296143, + "rewards/accuracy_reward/mean": 0.216796875, + "rewards/accuracy_reward/std": 0.4124660789966583, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6471.0, + "completions/max_terminated_length": 6471.0, + "completions/mean_length": 1116.447265625, + "completions/mean_terminated_length": 1116.447265625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.6918285086112129, + "grad_norm": 0.33387450968920596, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 202818371.0, + "reward": 0.6962890625, + "reward_std": 0.24100756645202637, + "rewards/accuracy_reward/mean": 0.197265625, + "rewards/accuracy_reward/std": 0.3983237147331238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6678.0, + "completions/max_terminated_length": 6678.0, + "completions/mean_length": 1145.73828125, + "completions/mean_terminated_length": 1145.73828125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.6937828264321485, + "grad_norm": 0.233447405777052, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 203473133.0, + "reward": 0.7177734375, + "reward_std": 0.18659594655036926, + "rewards/accuracy_reward/mean": 0.22265625, + "rewards/accuracy_reward/std": 0.41643625497817993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 7348.0, + "completions/max_terminated_length": 7348.0, + "completions/mean_length": 1171.61328125, + "completions/mean_terminated_length": 1176.2078857421875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.6957371442530842, + "grad_norm": 0.3246604912618191, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 204153207.0, + "reward": 0.619140625, + "reward_std": 0.14826922118663788, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4440.0, + "completions/max_terminated_length": 4440.0, + "completions/mean_length": 1105.12890625, + "completions/mean_terminated_length": 1107.2916259765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.6976914620740198, + "grad_norm": 0.2554949860592303, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 204801497.0, + "reward": 0.63671875, + "reward_std": 0.14479780197143555, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 5109.0, + "completions/max_terminated_length": 5109.0, + "completions/mean_length": 1016.69921875, + "completions/mean_terminated_length": 1022.6915893554688, + "completions/min_length": 0.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.6996457798949555, + "grad_norm": 0.4028040106428704, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 205406303.0, + "reward": 0.67578125, + "reward_std": 0.18319003283977509, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4383.0, + "completions/max_terminated_length": 4383.0, + "completions/mean_length": 1106.552734375, + "completions/mean_terminated_length": 1108.7181396484375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.701600097715891, + "grad_norm": 0.18934099428437415, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 206039914.0, + "reward": 0.5693359375, + "reward_std": 0.086665078997612, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.25592297315597534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7569.0, + "completions/max_terminated_length": 7569.0, + "completions/mean_length": 1113.005859375, + "completions/mean_terminated_length": 1113.005859375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.7035544155368266, + "grad_norm": 0.31215443631859846, + "learning_rate": 1e-06, + "loss": 0.0313, + "num_tokens": 206699053.0, + "reward": 0.6396484375, + "reward_std": 0.190708190202713, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7348.0, + "completions/max_terminated_length": 7348.0, + "completions/mean_length": 986.458984375, + "completions/mean_terminated_length": 986.458984375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.7055087333577623, + "grad_norm": 0.40482079560855483, + "learning_rate": 1e-06, + "loss": 0.0239, + "num_tokens": 207310072.0, + "reward": 0.79296875, + "reward_std": 0.2537032961845398, + "rewards/accuracy_reward/mean": 0.294921875, + "rewards/accuracy_reward/std": 0.4564536213874817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 6662.0, + "completions/max_terminated_length": 6662.0, + "completions/mean_length": 1036.123046875, + "completions/mean_terminated_length": 1040.186279296875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.7074630511786979, + "grad_norm": 0.27740156795336113, + "learning_rate": 1e-06, + "loss": 0.0248, + "num_tokens": 207913703.0, + "reward": 0.66796875, + "reward_std": 0.19501295685768127, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4425.0, + "completions/max_terminated_length": 4425.0, + "completions/mean_length": 993.97265625, + "completions/mean_terminated_length": 993.97265625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.7094173689996336, + "grad_norm": 0.25453413879873255, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 208491785.0, + "reward": 0.6259765625, + "reward_std": 0.17519697546958923, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2897.0, + "completions/max_terminated_length": 2897.0, + "completions/mean_length": 882.705078125, + "completions/mean_terminated_length": 882.705078125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.7113716868205692, + "grad_norm": 0.4145188913728128, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 209006546.0, + "reward": 0.70703125, + "reward_std": 0.24387207627296448, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40557438135147095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4103.0, + "completions/max_terminated_length": 4103.0, + "completions/mean_length": 1078.666015625, + "completions/mean_terminated_length": 1082.8961181640625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.7133260046415049, + "grad_norm": 0.3043131969924369, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 209636759.0, + "reward": 0.7841796875, + "reward_std": 0.26631104946136475, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45377036929130554, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4058.0, + "completions/max_terminated_length": 4058.0, + "completions/mean_length": 1104.177734375, + "completions/mean_terminated_length": 1106.3385009765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.7152803224624404, + "grad_norm": 0.2725181304859212, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 210286690.0, + "reward": 0.5703125, + "reward_std": 0.14889013767242432, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2192.0, + "completions/max_terminated_length": 2192.0, + "completions/mean_length": 943.48046875, + "completions/mean_terminated_length": 943.48046875, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.7172346402833761, + "grad_norm": 0.3578454161608576, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 210861256.0, + "reward": 0.779296875, + "reward_std": 0.26142898201942444, + "rewards/accuracy_reward/mean": 0.279296875, + "rewards/accuracy_reward/std": 0.44909247756004333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6655.0, + "completions/max_terminated_length": 6655.0, + "completions/mean_length": 1009.62890625, + "completions/mean_terminated_length": 1011.6046752929688, + "completions/min_length": 0.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.7191889581043117, + "grad_norm": 0.31400983798116405, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 211449930.0, + "reward": 0.6650390625, + "reward_std": 0.1915016770362854, + "rewards/accuracy_reward/mean": 0.166015625, + "rewards/accuracy_reward/std": 0.3724585771560669, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4456.0, + "completions/max_terminated_length": 4456.0, + "completions/mean_length": 1046.90234375, + "completions/mean_terminated_length": 1048.9510498046875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.7211432759252473, + "grad_norm": 0.2638251558380704, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 212071416.0, + "reward": 0.61328125, + "reward_std": 0.13417956233024597, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6430.0, + "completions/max_terminated_length": 6430.0, + "completions/mean_length": 1122.85546875, + "completions/mean_terminated_length": 1125.0528564453125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.723097593746183, + "grad_norm": 0.322451995568548, + "learning_rate": 1e-06, + "loss": -0.0349, + "num_tokens": 212716638.0, + "reward": 0.8271484375, + "reward_std": 0.2919270396232605, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4699897766113281, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5077.0, + "completions/max_terminated_length": 5077.0, + "completions/mean_length": 924.083984375, + "completions/mean_terminated_length": 924.083984375, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.7250519115671186, + "grad_norm": 0.36804551396783375, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 213255129.0, + "reward": 0.7158203125, + "reward_std": 0.2833227515220642, + "rewards/accuracy_reward/mean": 0.216796875, + "rewards/accuracy_reward/std": 0.4124660789966583, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 6508.0, + "completions/max_terminated_length": 6508.0, + "completions/mean_length": 886.998046875, + "completions/mean_terminated_length": 892.2259521484375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.7270062293880543, + "grad_norm": 0.4301408532662571, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 213777080.0, + "reward": 0.748046875, + "reward_std": 0.25287139415740967, + "rewards/accuracy_reward/mean": 0.251953125, + "rewards/accuracy_reward/std": 0.43455907702445984, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 6980.0, + "completions/max_terminated_length": 6980.0, + "completions/mean_length": 1071.779296875, + "completions/mean_terminated_length": 1078.0963134765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.7289605472089898, + "grad_norm": 0.3604566797035827, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 214391303.0, + "reward": 0.7646484375, + "reward_std": 0.2593989968299866, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.4461594223976135, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.982421875, + "rewards/soft_format_reward/std": 0.13154059648513794, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 2233.0, + "completions/max_terminated_length": 2233.0, + "completions/mean_length": 876.251953125, + "completions/mean_terminated_length": 877.9667358398438, + "completions/min_length": 0.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.7309148650299255, + "grad_norm": 0.40624269339990965, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 214914952.0, + "reward": 0.6787109375, + "reward_std": 0.2522028982639313, + "rewards/accuracy_reward/mean": 0.181640625, + "rewards/accuracy_reward/std": 0.38592514395713806, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4847.0, + "completions/max_terminated_length": 4847.0, + "completions/mean_length": 990.787109375, + "completions/mean_terminated_length": 990.787109375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.7328691828508611, + "grad_norm": 0.4541765929616306, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 215482923.0, + "reward": 0.8212890625, + "reward_std": 0.31638646125793457, + "rewards/accuracy_reward/mean": 0.322265625, + "rewards/accuracy_reward/std": 0.46780112385749817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7344.0, + "completions/max_terminated_length": 7344.0, + "completions/mean_length": 1022.345703125, + "completions/mean_terminated_length": 1022.345703125, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.7348235006717968, + "grad_norm": 0.3970455581004104, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 216068492.0, + "reward": 0.69140625, + "reward_std": 0.1918405294418335, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6027.0, + "completions/max_terminated_length": 6027.0, + "completions/mean_length": 1009.193359375, + "completions/mean_terminated_length": 1011.1682739257812, + "completions/min_length": 0.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.7367778184927324, + "grad_norm": 0.314671555466819, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 216653167.0, + "reward": 0.58984375, + "reward_std": 0.1450338065624237, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7100.0, + "completions/max_terminated_length": 7100.0, + "completions/mean_length": 1019.45703125, + "completions/mean_terminated_length": 1021.4520263671875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.738732136313668, + "grad_norm": 0.32555850082687493, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 217243481.0, + "reward": 0.666015625, + "reward_std": 0.18606624007225037, + "rewards/accuracy_reward/mean": 0.16796875, + "rewards/accuracy_reward/std": 0.374204158782959, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3465.0, + "completions/max_terminated_length": 3465.0, + "completions/mean_length": 992.78515625, + "completions/mean_terminated_length": 992.78515625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.7406864541346037, + "grad_norm": 0.36980852340566556, + "learning_rate": 1e-06, + "loss": -0.006, + "num_tokens": 217818523.0, + "reward": 0.638671875, + "reward_std": 0.20307320356369019, + "rewards/accuracy_reward/mean": 0.138671875, + "rewards/accuracy_reward/std": 0.34594178199768066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 6222.0, + "completions/max_terminated_length": 6222.0, + "completions/mean_length": 943.513671875, + "completions/mean_terminated_length": 949.07470703125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.7426407719555392, + "grad_norm": 0.24487953479880442, + "learning_rate": 1e-06, + "loss": -0.0178, + "num_tokens": 218378130.0, + "reward": 0.5322265625, + "reward_std": 0.09823274612426758, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4980.0, + "completions/max_terminated_length": 4980.0, + "completions/mean_length": 995.0859375, + "completions/mean_terminated_length": 997.0332641601562, + "completions/min_length": 0.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.7445950897764749, + "grad_norm": 0.2965074530884303, + "learning_rate": 1e-06, + "loss": -0.0148, + "num_tokens": 218957918.0, + "reward": 0.6845703125, + "reward_std": 0.19378219544887543, + "rewards/accuracy_reward/mean": 0.185546875, + "rewards/accuracy_reward/std": 0.38912075757980347, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3038.0, + "completions/max_terminated_length": 3038.0, + "completions/mean_length": 933.126953125, + "completions/mean_terminated_length": 936.7863159179688, + "completions/min_length": 0.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.7465494075974105, + "grad_norm": 0.32534658578356584, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 219505295.0, + "reward": 0.609375, + "reward_std": 0.12045562267303467, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6350.0, + "completions/max_terminated_length": 6350.0, + "completions/mean_length": 1036.14453125, + "completions/mean_terminated_length": 1038.1722412109375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7485037254183462, + "grad_norm": 0.3884019634144294, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 220106329.0, + "reward": 0.615234375, + "reward_std": 0.18547414243221283, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6318.0, + "completions/max_terminated_length": 6318.0, + "completions/mean_length": 958.767578125, + "completions/mean_terminated_length": 958.767578125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.7504580432392818, + "grad_norm": 0.3261956315256191, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 220657874.0, + "reward": 0.6181640625, + "reward_std": 0.1624271422624588, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4933.0, + "completions/max_terminated_length": 4933.0, + "completions/mean_length": 906.39453125, + "completions/mean_terminated_length": 908.1682739257812, + "completions/min_length": 0.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.7524123610602175, + "grad_norm": 0.518211748980467, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 221192156.0, + "reward": 0.66015625, + "reward_std": 0.15569275617599487, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4788.0, + "completions/max_terminated_length": 4788.0, + "completions/mean_length": 1028.16796875, + "completions/mean_terminated_length": 1028.16796875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.754366678881153, + "grad_norm": 0.4206041115558596, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 221793666.0, + "reward": 0.6025390625, + "reward_std": 0.18358266353607178, + "rewards/accuracy_reward/mean": 0.103515625, + "rewards/accuracy_reward/std": 0.30492907762527466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 6583.0, + "completions/max_terminated_length": 6583.0, + "completions/mean_length": 953.21875, + "completions/mean_terminated_length": 960.7244262695312, + "completions/min_length": 0.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.7563209967020886, + "grad_norm": 0.33537772353898526, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 222362930.0, + "reward": 0.634765625, + "reward_std": 0.11699375510215759, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3864.0, + "completions/max_terminated_length": 3864.0, + "completions/mean_length": 929.021484375, + "completions/mean_terminated_length": 932.6647338867188, + "completions/min_length": 0.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.7582753145230243, + "grad_norm": 0.37255690277059367, + "learning_rate": 1e-06, + "loss": -0.0178, + "num_tokens": 222911197.0, + "reward": 0.6123046875, + "reward_std": 0.14915111660957336, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4628.0, + "completions/max_terminated_length": 4628.0, + "completions/mean_length": 938.017578125, + "completions/mean_terminated_length": 941.6961059570312, + "completions/min_length": 0.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.7602296323439599, + "grad_norm": 0.4258639055357595, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 223460022.0, + "reward": 0.6357421875, + "reward_std": 0.18806537985801697, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3479743003845215, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4274.0, + "completions/max_terminated_length": 4274.0, + "completions/mean_length": 913.390625, + "completions/mean_terminated_length": 915.1781005859375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.7621839501648956, + "grad_norm": 0.3666960447389148, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 223996318.0, + "reward": 0.611328125, + "reward_std": 0.16379114985466003, + "rewards/accuracy_reward/mean": 0.11328125, + "rewards/accuracy_reward/std": 0.3172462284564972, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6970.0, + "completions/max_terminated_length": 6970.0, + "completions/mean_length": 979.921875, + "completions/mean_terminated_length": 981.8395385742188, + "completions/min_length": 0.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.7641382679858312, + "grad_norm": 0.3684451106834205, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 224571974.0, + "reward": 0.654296875, + "reward_std": 0.19255173206329346, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4276.0, + "completions/max_terminated_length": 4276.0, + "completions/mean_length": 869.150390625, + "completions/mean_terminated_length": 872.5588989257812, + "completions/min_length": 0.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.7660925858067669, + "grad_norm": 0.34728342736467754, + "learning_rate": 1e-06, + "loss": -0.0205, + "num_tokens": 225093939.0, + "reward": 0.609375, + "reward_std": 0.15344160795211792, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7110.0, + "completions/max_terminated_length": 7110.0, + "completions/mean_length": 871.859375, + "completions/mean_terminated_length": 873.5655517578125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.7680469036277024, + "grad_norm": 0.43154462615966055, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 225608219.0, + "reward": 0.6826171875, + "reward_std": 0.16245341300964355, + "rewards/accuracy_reward/mean": 0.185546875, + "rewards/accuracy_reward/std": 0.38912075757980347, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 3407.0, + "completions/max_terminated_length": 3407.0, + "completions/mean_length": 967.962890625, + "completions/mean_terminated_length": 977.5089111328125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.7700012214486381, + "grad_norm": 0.4027808061139334, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 226172456.0, + "reward": 0.5859375, + "reward_std": 0.12922191619873047, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7499.0, + "completions/max_terminated_length": 7499.0, + "completions/mean_length": 922.8125, + "completions/mean_terminated_length": 924.618408203125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.7719555392695737, + "grad_norm": 0.691596988533144, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 226714664.0, + "reward": 0.6416015625, + "reward_std": 0.1713179498910904, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5388.0, + "completions/max_terminated_length": 5388.0, + "completions/mean_length": 958.908203125, + "completions/mean_terminated_length": 958.908203125, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.7739098570905093, + "grad_norm": 0.37674261959260547, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 227279177.0, + "reward": 0.646484375, + "reward_std": 0.19885963201522827, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2565.0, + "completions/max_terminated_length": 2565.0, + "completions/mean_length": 952.6640625, + "completions/mean_terminated_length": 952.6640625, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.775864174911445, + "grad_norm": 0.40555615158616737, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 227834461.0, + "reward": 0.650390625, + "reward_std": 0.14763301610946655, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 7856.0, + "completions/max_terminated_length": 7856.0, + "completions/mean_length": 910.412109375, + "completions/mean_terminated_length": 917.5806884765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.7778184927323806, + "grad_norm": 0.31676714671486106, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 228370448.0, + "reward": 0.615234375, + "reward_std": 0.15904265642166138, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 6044.0, + "completions/max_terminated_length": 6044.0, + "completions/mean_length": 909.666015625, + "completions/mean_terminated_length": 915.0275268554688, + "completions/min_length": 0.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.7797728105533163, + "grad_norm": 0.4301922701918752, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 228907285.0, + "reward": 0.630859375, + "reward_std": 0.1643366813659668, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 7297.0, + "completions/max_terminated_length": 7297.0, + "completions/mean_length": 1086.357421875, + "completions/mean_terminated_length": 1090.61767578125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.7817271283742518, + "grad_norm": 0.31878737353186365, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 229533692.0, + "reward": 0.6142578125, + "reward_std": 0.17186219990253448, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 6899.0, + "completions/max_terminated_length": 6899.0, + "completions/mean_length": 943.283203125, + "completions/mean_terminated_length": 948.8428344726562, + "completions/min_length": 0.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.7836814461951875, + "grad_norm": 0.4735750958821543, + "learning_rate": 1e-06, + "loss": -0.0207, + "num_tokens": 230090557.0, + "reward": 0.6552734375, + "reward_std": 0.17208826541900635, + "rewards/accuracy_reward/mean": 0.158203125, + "rewards/accuracy_reward/std": 0.36528825759887695, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 5936.0, + "completions/max_terminated_length": 5936.0, + "completions/mean_length": 1051.05078125, + "completions/mean_terminated_length": 1059.3267822265625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.7856357640161231, + "grad_norm": 0.25808361902275967, + "learning_rate": 1e-06, + "loss": -0.0123, + "num_tokens": 230694583.0, + "reward": 0.6357421875, + "reward_std": 0.14681567251682281, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 2561.0, + "completions/max_terminated_length": 2561.0, + "completions/mean_length": 932.59765625, + "completions/mean_terminated_length": 938.0943603515625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7875900818370587, + "grad_norm": 0.338524126395727, + "learning_rate": 1e-06, + "loss": -0.0201, + "num_tokens": 231238361.0, + "reward": 0.6318359375, + "reward_std": 0.12664470076560974, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6302.0, + "completions/max_terminated_length": 6302.0, + "completions/mean_length": 1040.21875, + "completions/mean_terminated_length": 1042.25439453125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.7895443996579944, + "grad_norm": 0.29982835429117166, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 231841929.0, + "reward": 0.6875, + "reward_std": 0.1078319400548935, + "rewards/accuracy_reward/mean": 0.189453125, + "rewards/accuracy_reward/std": 0.3922513723373413, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3213.0, + "completions/max_terminated_length": 3213.0, + "completions/mean_length": 1009.005859375, + "completions/mean_terminated_length": 1012.9628295898438, + "completions/min_length": 0.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.79149871747893, + "grad_norm": 0.29425661365524536, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 232429724.0, + "reward": 0.640625, + "reward_std": 0.11602403223514557, + "rewards/accuracy_reward/mean": 0.142578125, + "rewards/accuracy_reward/std": 0.3499840497970581, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4406.0, + "completions/max_terminated_length": 4406.0, + "completions/mean_length": 912.421875, + "completions/mean_terminated_length": 916.0000610351562, + "completions/min_length": 0.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.7934530352998657, + "grad_norm": 0.37854393112195983, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 232967988.0, + "reward": 0.708984375, + "reward_std": 0.16139957308769226, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4083731174468994, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2589.0, + "completions/max_terminated_length": 2589.0, + "completions/mean_length": 960.513671875, + "completions/mean_terminated_length": 964.2804565429688, + "completions/min_length": 0.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.7954073531208012, + "grad_norm": 0.3332586191909758, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 233538939.0, + "reward": 0.6435546875, + "reward_std": 0.16915643215179443, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6534.0, + "completions/max_terminated_length": 6534.0, + "completions/mean_length": 981.3984375, + "completions/mean_terminated_length": 983.3189697265625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.7973616709417369, + "grad_norm": 0.3643028231648123, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 234117927.0, + "reward": 0.5400390625, + "reward_std": 0.1113402247428894, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5601.0, + "completions/max_terminated_length": 5601.0, + "completions/mean_length": 1029.033203125, + "completions/mean_terminated_length": 1031.0469970703125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.7993159887626725, + "grad_norm": 0.27210998043712176, + "learning_rate": 1e-06, + "loss": 0.0238, + "num_tokens": 234715864.0, + "reward": 0.5634765625, + "reward_std": 0.14024868607521057, + "rewards/accuracy_reward/mean": 0.06640625, + "rewards/accuracy_reward/std": 0.2492343932390213, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2318.0, + "completions/max_terminated_length": 2318.0, + "completions/mean_length": 866.82421875, + "completions/mean_terminated_length": 866.82421875, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.8012703065836082, + "grad_norm": 0.3037792118512972, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 235231710.0, + "reward": 0.556640625, + "reward_std": 0.10441835969686508, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6134.0, + "completions/max_terminated_length": 6134.0, + "completions/mean_length": 1042.263671875, + "completions/mean_terminated_length": 1042.263671875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.8032246244045438, + "grad_norm": 0.21343908640318268, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 235845365.0, + "reward": 0.578125, + "reward_std": 0.10139063745737076, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2386.0, + "completions/max_terminated_length": 2386.0, + "completions/mean_length": 923.861328125, + "completions/mean_terminated_length": 923.861328125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.8051789422254794, + "grad_norm": 0.4246737800041493, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 236386622.0, + "reward": 0.6953125, + "reward_std": 0.2369774878025055, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3968288004398346, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4719.0, + "completions/max_terminated_length": 4719.0, + "completions/mean_length": 1059.751953125, + "completions/mean_terminated_length": 1061.8258056640625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.8071332600464151, + "grad_norm": 0.3174297709452347, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 237001903.0, + "reward": 0.64453125, + "reward_std": 0.12615203857421875, + "rewards/accuracy_reward/mean": 0.146484375, + "rewards/accuracy_reward/std": 0.35393697023391724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4971.0, + "completions/max_terminated_length": 4971.0, + "completions/mean_length": 964.205078125, + "completions/mean_terminated_length": 966.0919799804688, + "completions/min_length": 0.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.8090875778673506, + "grad_norm": 0.32884289669794287, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 237566152.0, + "reward": 0.623046875, + "reward_std": 0.1059093326330185, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 6131.0, + "completions/max_terminated_length": 6131.0, + "completions/mean_length": 1009.841796875, + "completions/mean_terminated_length": 1013.802001953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.8110418956882863, + "grad_norm": 0.3367485681249713, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 238155703.0, + "reward": 0.5947265625, + "reward_std": 0.1472226232290268, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3666.0, + "completions/max_terminated_length": 3666.0, + "completions/mean_length": 956.623046875, + "completions/mean_terminated_length": 956.623046875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.8129962135092219, + "grad_norm": 0.33689574748751255, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 238714614.0, + "reward": 0.6015625, + "reward_std": 0.1009584367275238, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 7290.0, + "completions/max_terminated_length": 7290.0, + "completions/mean_length": 997.08984375, + "completions/mean_terminated_length": 1002.9666137695312, + "completions/min_length": 0.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.8149505313301576, + "grad_norm": 0.3948622983106675, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 239295092.0, + "reward": 0.6142578125, + "reward_std": 0.2135741412639618, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2532.0, + "completions/max_terminated_length": 2532.0, + "completions/mean_length": 1012.25390625, + "completions/mean_terminated_length": 1012.25390625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.8169048491510932, + "grad_norm": 0.33794210059958724, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 239884838.0, + "reward": 0.5966796875, + "reward_std": 0.17836013436317444, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 6145.0, + "completions/max_terminated_length": 6145.0, + "completions/mean_length": 912.123046875, + "completions/mean_terminated_length": 915.7000732421875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.8188591669720289, + "grad_norm": 0.34581372400674354, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 240419925.0, + "reward": 0.689453125, + "reward_std": 0.17225705087184906, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4796.0, + "completions/max_terminated_length": 4796.0, + "completions/mean_length": 966.2265625, + "completions/mean_terminated_length": 968.117431640625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.8208134847929645, + "grad_norm": 0.3500936745741667, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 241005449.0, + "reward": 0.6201171875, + "reward_std": 0.10935501754283905, + "rewards/accuracy_reward/mean": 0.12109375, + "rewards/accuracy_reward/std": 0.3265552520751953, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 7805.0, + "completions/max_terminated_length": 7805.0, + "completions/mean_length": 991.44140625, + "completions/mean_terminated_length": 1001.2189331054688, + "completions/min_length": 0.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.8227678026139, + "grad_norm": 0.38934485719330975, + "learning_rate": 1e-06, + "loss": -0.0282, + "num_tokens": 241587275.0, + "reward": 0.60546875, + "reward_std": 0.13954401016235352, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 6014.0, + "completions/max_terminated_length": 6014.0, + "completions/mean_length": 977.8828125, + "completions/mean_terminated_length": 983.6464233398438, + "completions/min_length": 0.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.8247221204348357, + "grad_norm": 0.3433169182244252, + "learning_rate": 1e-06, + "loss": -0.0237, + "num_tokens": 242158511.0, + "reward": 0.7021484375, + "reward_std": 0.1866951286792755, + "rewards/accuracy_reward/mean": 0.205078125, + "rewards/accuracy_reward/std": 0.4041535556316376, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7744.0, + "completions/max_terminated_length": 7744.0, + "completions/mean_length": 1109.650390625, + "completions/mean_terminated_length": 1109.650390625, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.8266764382557713, + "grad_norm": 0.08897198593864354, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 242805532.0, + "reward": 0.5087890625, + "reward_std": 0.02662741020321846, + "rewards/accuracy_reward/mean": 0.009765625, + "rewards/accuracy_reward/std": 0.09843364357948303, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4253.0, + "completions/max_terminated_length": 4253.0, + "completions/mean_length": 994.1171875, + "completions/mean_terminated_length": 996.0626220703125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.828630756076707, + "grad_norm": 0.25036550299089266, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 243407800.0, + "reward": 0.6279296875, + "reward_std": 0.10964851081371307, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6195.0, + "completions/max_terminated_length": 6195.0, + "completions/mean_length": 1066.853515625, + "completions/mean_terminated_length": 1068.9412841796875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.8305850738976426, + "grad_norm": 0.29753793308091675, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 244027133.0, + "reward": 0.6474609375, + "reward_std": 0.1495560109615326, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7208.0, + "completions/max_terminated_length": 7208.0, + "completions/mean_length": 973.31640625, + "completions/mean_terminated_length": 975.2211303710938, + "completions/min_length": 0.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.8325393917185783, + "grad_norm": 0.21933019531048625, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 244606527.0, + "reward": 0.5556640625, + "reward_std": 0.08307676017284393, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5318.0, + "completions/max_terminated_length": 5318.0, + "completions/mean_length": 1101.78515625, + "completions/mean_terminated_length": 1103.9412841796875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.8344937095395139, + "grad_norm": 0.16589801340342616, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 245245025.0, + "reward": 0.6142578125, + "reward_std": 0.06172895431518555, + "rewards/accuracy_reward/mean": 0.115234375, + "rewards/accuracy_reward/std": 0.3196168541908264, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5287.0, + "completions/max_terminated_length": 5287.0, + "completions/mean_length": 1136.810546875, + "completions/mean_terminated_length": 1141.2686767578125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.8364480273604495, + "grad_norm": 0.2674423155253901, + "learning_rate": 1e-06, + "loss": -0.0149, + "num_tokens": 245909104.0, + "reward": 0.5302734375, + "reward_std": 0.08664971590042114, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5090.0, + "completions/max_terminated_length": 5090.0, + "completions/mean_length": 1063.6015625, + "completions/mean_terminated_length": 1067.7725830078125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.8384023451813851, + "grad_norm": 0.22748378629166094, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 246538212.0, + "reward": 0.552734375, + "reward_std": 0.10975532233715057, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6370.0, + "completions/max_terminated_length": 6370.0, + "completions/mean_length": 1168.16015625, + "completions/mean_terminated_length": 1170.4461669921875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.8403566630023207, + "grad_norm": 0.175614187020851, + "learning_rate": 1e-06, + "loss": -0.0136, + "num_tokens": 247228198.0, + "reward": 0.5595703125, + "reward_std": 0.1177278533577919, + "rewards/accuracy_reward/mean": 0.060546875, + "rewards/accuracy_reward/std": 0.2387305200099945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 7028.0, + "completions/max_terminated_length": 7028.0, + "completions/mean_length": 1195.5, + "completions/mean_terminated_length": 1202.5462646484375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.8423109808232564, + "grad_norm": 0.23695792193235804, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 247914806.0, + "reward": 0.5146484375, + "reward_std": 0.07785187661647797, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.1385180652141571, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6903.0, + "completions/max_terminated_length": 6903.0, + "completions/mean_length": 1127.373046875, + "completions/mean_terminated_length": 1129.5792236328125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.844265298644192, + "grad_norm": 0.11663515092017687, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 248569461.0, + "reward": 0.5107421875, + "reward_std": 0.03498993441462517, + "rewards/accuracy_reward/mean": 0.01171875, + "rewards/accuracy_reward/std": 0.10772226005792618, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3344.0, + "completions/max_terminated_length": 3344.0, + "completions/mean_length": 1037.2578125, + "completions/mean_terminated_length": 1037.2578125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.8462196164651277, + "grad_norm": 0.28594613617628284, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 249219401.0, + "reward": 0.6337890625, + "reward_std": 0.1702260971069336, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5211.0, + "completions/max_terminated_length": 5211.0, + "completions/mean_length": 1075.044921875, + "completions/mean_terminated_length": 1079.2608642578125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.8481739342860632, + "grad_norm": 0.22526615196834204, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 249839280.0, + "reward": 0.6259765625, + "reward_std": 0.10403385758399963, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3425.0, + "completions/max_terminated_length": 3425.0, + "completions/mean_length": 1031.0234375, + "completions/mean_terminated_length": 1031.0234375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.8501282521069989, + "grad_norm": 0.17401951468556728, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 250424572.0, + "reward": 0.55078125, + "reward_std": 0.07893750816583633, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 7366.0, + "completions/max_terminated_length": 7366.0, + "completions/mean_length": 1093.98828125, + "completions/mean_terminated_length": 1100.4361572265625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.8520825699279345, + "grad_norm": 0.2578315372381993, + "learning_rate": 1e-06, + "loss": -0.0221, + "num_tokens": 251055862.0, + "reward": 0.6025390625, + "reward_std": 0.139075368642807, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2985.0, + "completions/max_terminated_length": 2985.0, + "completions/mean_length": 964.228515625, + "completions/mean_terminated_length": 968.0098876953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.8540368877488702, + "grad_norm": 0.21497055676655763, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 251647435.0, + "reward": 0.5859375, + "reward_std": 0.11568523943424225, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 7944.0, + "completions/max_terminated_length": 7944.0, + "completions/mean_length": 1086.162109375, + "completions/mean_terminated_length": 1092.5638427734375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.8559912055698058, + "grad_norm": 0.2878423765376644, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 252289726.0, + "reward": 0.59765625, + "reward_std": 0.16791057586669922, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.30236753821372986, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5423.0, + "completions/max_terminated_length": 5423.0, + "completions/mean_length": 1212.712890625, + "completions/mean_terminated_length": 1217.46875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.8579455233907414, + "grad_norm": 0.11133652050631727, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 252984811.0, + "reward": 0.5322265625, + "reward_std": 0.042333416640758514, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 6565.0, + "completions/max_terminated_length": 6565.0, + "completions/mean_length": 1056.515625, + "completions/mean_terminated_length": 1060.658935546875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.8598998412116771, + "grad_norm": 0.16602797567773786, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 253607971.0, + "reward": 0.552734375, + "reward_std": 0.05688370764255524, + "rewards/accuracy_reward/mean": 0.056640625, + "rewards/accuracy_reward/std": 0.23138070106506348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 5915.0, + "completions/max_terminated_length": 5915.0, + "completions/mean_length": 1055.13671875, + "completions/mean_terminated_length": 1061.3555908203125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.8618541590326126, + "grad_norm": 0.1293290685371719, + "learning_rate": 1e-06, + "loss": -0.0241, + "num_tokens": 254226089.0, + "reward": 0.5107421875, + "reward_std": 0.044180579483509064, + "rewards/accuracy_reward/mean": 0.013671875, + "rewards/accuracy_reward/std": 0.1162383034825325, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 5173.0, + "completions/max_terminated_length": 5173.0, + "completions/mean_length": 1010.29296875, + "completions/mean_terminated_length": 1020.2564086914062, + "completions/min_length": 0.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.8638084768535483, + "grad_norm": 0.23706741657214334, + "learning_rate": 1e-06, + "loss": -0.0292, + "num_tokens": 254803775.0, + "reward": 0.5419921875, + "reward_std": 0.11632746458053589, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21157780289649963, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5251.0, + "completions/max_terminated_length": 5251.0, + "completions/mean_length": 1106.822265625, + "completions/mean_terminated_length": 1106.822265625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.8657627946744839, + "grad_norm": 0.16200383248137007, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 255454980.0, + "reward": 0.572265625, + "reward_std": 0.13231495022773743, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7108.0, + "completions/max_terminated_length": 7108.0, + "completions/mean_length": 1076.23046875, + "completions/mean_terminated_length": 1078.3365478515625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.8677171124954196, + "grad_norm": 0.1654129731207247, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 256076010.0, + "reward": 0.5615234375, + "reward_std": 0.0684361383318901, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.2422981858253479, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 6809.0, + "completions/max_terminated_length": 6809.0, + "completions/mean_length": 1159.08203125, + "completions/mean_terminated_length": 1172.826171875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.8696714303163552, + "grad_norm": 0.21249686143049437, + "learning_rate": 1e-06, + "loss": -0.0366, + "num_tokens": 256754004.0, + "reward": 0.54296875, + "reward_std": 0.11996020376682281, + "rewards/accuracy_reward/mean": 0.048828125, + "rewards/accuracy_reward/std": 0.2157193273305893, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 2915.0, + "completions/max_terminated_length": 2915.0, + "completions/mean_length": 994.041015625, + "completions/mean_terminated_length": 1001.8681030273438, + "completions/min_length": 0.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.8716257481372909, + "grad_norm": 0.1441227104878641, + "learning_rate": 1e-06, + "loss": -0.0242, + "num_tokens": 257343721.0, + "reward": 0.5146484375, + "reward_std": 0.0560164600610733, + "rewards/accuracy_reward/mean": 0.01953125, + "rewards/accuracy_reward/std": 0.1385180652141571, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5521.0, + "completions/max_terminated_length": 5521.0, + "completions/mean_length": 1182.47265625, + "completions/mean_terminated_length": 1182.47265625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.8735800659582265, + "grad_norm": 0.15561883320588535, + "learning_rate": 1e-06, + "loss": 0.0163, + "num_tokens": 258026875.0, + "reward": 0.5244140625, + "reward_std": 0.06337852776050568, + "rewards/accuracy_reward/mean": 0.025390625, + "rewards/accuracy_reward/std": 0.15746226906776428, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7288.0, + "completions/max_terminated_length": 7288.0, + "completions/mean_length": 1277.51953125, + "completions/mean_terminated_length": 1280.01953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.875534383779162, + "grad_norm": 0.16339822889338884, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 258754837.0, + "reward": 0.5673828125, + "reward_std": 0.09846583753824234, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 5407.0, + "completions/max_terminated_length": 5407.0, + "completions/mean_length": 1107.4453125, + "completions/mean_terminated_length": 1113.9725341796875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.8774887016000977, + "grad_norm": 0.10632321707459978, + "learning_rate": 1e-06, + "loss": -0.0282, + "num_tokens": 259401657.0, + "reward": 0.5400390625, + "reward_std": 0.06761807948350906, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3561.0, + "completions/max_terminated_length": 3561.0, + "completions/mean_length": 1222.599609375, + "completions/mean_terminated_length": 1227.3941650390625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.8794430194210333, + "grad_norm": 0.10892538234873252, + "learning_rate": 1e-06, + "loss": -0.0189, + "num_tokens": 260121132.0, + "reward": 0.51953125, + "reward_std": 0.05688370391726494, + "rewards/accuracy_reward/mean": 0.021484375, + "rewards/accuracy_reward/std": 0.14513419568538666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4403.0, + "completions/max_terminated_length": 4403.0, + "completions/mean_length": 1116.48046875, + "completions/mean_terminated_length": 1118.6654052734375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.881397337241969, + "grad_norm": 0.16071659186101964, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 260781058.0, + "reward": 0.5341796875, + "reward_std": 0.07790613174438477, + "rewards/accuracy_reward/mean": 0.037109375, + "rewards/accuracy_reward/std": 0.18921469151973724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 8090.0, + "completions/max_terminated_length": 8090.0, + "completions/mean_length": 1182.669921875, + "completions/mean_terminated_length": 1201.4425048828125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.8833516550629046, + "grad_norm": 0.15866915140653665, + "learning_rate": 1e-06, + "loss": -0.0322, + "num_tokens": 261457193.0, + "reward": 0.52734375, + "reward_std": 0.07991015911102295, + "rewards/accuracy_reward/mean": 0.03515625, + "rewards/accuracy_reward/std": 0.1843547374010086, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.984375, + "rewards/soft_format_reward/std": 0.12414088100194931, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 5869.0, + "completions/max_terminated_length": 5869.0, + "completions/mean_length": 939.30078125, + "completions/mean_terminated_length": 944.8369750976562, + "completions/min_length": 0.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.8853059728838403, + "grad_norm": 0.2834832013156954, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 262027827.0, + "reward": 0.6455078125, + "reward_std": 0.15948840975761414, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.35588082671165466, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6611.0, + "completions/max_terminated_length": 6611.0, + "completions/mean_length": 1134.890625, + "completions/mean_terminated_length": 1137.111572265625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8872602907047759, + "grad_norm": 0.291471829751961, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 262694251.0, + "reward": 0.6953125, + "reward_std": 0.18523366749286652, + "rewards/accuracy_reward/mean": 0.197265625, + "rewards/accuracy_reward/std": 0.3983237147331238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6127.0, + "completions/max_terminated_length": 6127.0, + "completions/mean_length": 1059.12109375, + "completions/mean_terminated_length": 1059.12109375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.8892146085257115, + "grad_norm": 0.2114573894858397, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 263320217.0, + "reward": 0.671875, + "reward_std": 0.1525377631187439, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3776407241821289, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4727.0, + "completions/max_terminated_length": 4727.0, + "completions/mean_length": 1134.392578125, + "completions/mean_terminated_length": 1138.8411865234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.8911689263466471, + "grad_norm": 0.261096361150442, + "learning_rate": 1e-06, + "loss": -0.0096, + "num_tokens": 263989634.0, + "reward": 0.689453125, + "reward_std": 0.16820251941680908, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 6432.0, + "completions/max_terminated_length": 6432.0, + "completions/mean_length": 1219.185546875, + "completions/mean_terminated_length": 1228.785400390625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.8931232441675827, + "grad_norm": 0.22638161862707587, + "learning_rate": 1e-06, + "loss": -0.025, + "num_tokens": 264687921.0, + "reward": 0.6845703125, + "reward_std": 0.18046662211418152, + "rewards/accuracy_reward/mean": 0.19140625, + "rewards/accuracy_reward/std": 0.3937928080558777, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 1126.154296875, + "completions/mean_terminated_length": 1126.154296875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.8950775619885184, + "grad_norm": 0.20845463971683803, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 265347696.0, + "reward": 0.60546875, + "reward_std": 0.10685320198535919, + "rewards/accuracy_reward/mean": 0.10546875, + "rewards/accuracy_reward/std": 0.3074568510055542, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 6913.0, + "completions/max_terminated_length": 6913.0, + "completions/mean_length": 1081.65625, + "completions/mean_terminated_length": 1090.1732177734375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.897031879809454, + "grad_norm": 0.289975433876716, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 266016672.0, + "reward": 0.59375, + "reward_std": 0.17172493040561676, + "rewards/accuracy_reward/mean": 0.09765625, + "rewards/accuracy_reward/std": 0.29713961482048035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 7313.0, + "completions/max_terminated_length": 7313.0, + "completions/mean_length": 1092.072265625, + "completions/mean_terminated_length": 1098.5089111328125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.8989861976303897, + "grad_norm": 0.2545239766875316, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 266650917.0, + "reward": 0.6796875, + "reward_std": 0.16066578030586243, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.3875311613082886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 7371.0, + "completions/max_terminated_length": 7371.0, + "completions/mean_length": 1107.66796875, + "completions/mean_terminated_length": 1118.5916748046875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.9009405154513253, + "grad_norm": 0.21722854465129093, + "learning_rate": 1e-06, + "loss": -0.0336, + "num_tokens": 267280523.0, + "reward": 0.5458984375, + "reward_std": 0.10777147114276886, + "rewards/accuracy_reward/mean": 0.05078125, + "rewards/accuracy_reward/std": 0.21976542472839355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3067.0, + "completions/max_terminated_length": 3067.0, + "completions/mean_length": 1133.224609375, + "completions/mean_terminated_length": 1135.4422607421875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.9028948332722609, + "grad_norm": 0.22960432303086395, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 267943198.0, + "reward": 0.5927734375, + "reward_std": 0.14359915256500244, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29176566004753113, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 6723.0, + "completions/max_terminated_length": 6723.0, + "completions/mean_length": 1243.921875, + "completions/mean_terminated_length": 1251.2535400390625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.9048491510931965, + "grad_norm": 0.10369050169216122, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 268639270.0, + "reward": 0.5283203125, + "reward_std": 0.07514636963605881, + "rewards/accuracy_reward/mean": 0.033203125, + "rewards/accuracy_reward/std": 0.17934183776378632, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4554.0, + "completions/max_terminated_length": 4554.0, + "completions/mean_length": 1109.744140625, + "completions/mean_terminated_length": 1114.09619140625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.9068034689141322, + "grad_norm": 0.19226806294365417, + "learning_rate": 1e-06, + "loss": -0.0149, + "num_tokens": 269276275.0, + "reward": 0.607421875, + "reward_std": 0.14193780720233917, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31241437792778015, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6798.0, + "completions/max_terminated_length": 6798.0, + "completions/mean_length": 1114.126953125, + "completions/mean_terminated_length": 1116.3072509765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.9087577867350678, + "grad_norm": 0.12107408165475937, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 269919556.0, + "reward": 0.5537109375, + "reward_std": 0.06196758523583412, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 6676.0, + "completions/max_terminated_length": 6676.0, + "completions/mean_length": 1121.8359375, + "completions/mean_terminated_length": 1130.6693115234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.9107121045560034, + "grad_norm": 0.12822558425421798, + "learning_rate": 1e-06, + "loss": -0.024, + "num_tokens": 270577536.0, + "reward": 0.55078125, + "reward_std": 0.08699656277894974, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.2275916188955307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4857.0, + "completions/max_terminated_length": 4857.0, + "completions/mean_length": 1109.7578125, + "completions/mean_terminated_length": 1111.9295654296875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.9126664223769391, + "grad_norm": 0.22502940296498836, + "learning_rate": 1e-06, + "loss": -0.0196, + "num_tokens": 271210084.0, + "reward": 0.5908203125, + "reward_std": 0.1166892871260643, + "rewards/accuracy_reward/mean": 0.091796875, + "rewards/accuracy_reward/std": 0.289021372795105, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6616.0, + "completions/max_terminated_length": 6616.0, + "completions/mean_length": 1109.375, + "completions/mean_terminated_length": 1111.5460205078125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.9146207401978746, + "grad_norm": 0.19036038536273306, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 271852100.0, + "reward": 0.5703125, + "reward_std": 0.07824784517288208, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 5432.0, + "completions/max_terminated_length": 5432.0, + "completions/mean_length": 1133.40234375, + "completions/mean_terminated_length": 1140.08251953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.9165750580188103, + "grad_norm": 0.2086796580655049, + "learning_rate": 1e-06, + "loss": -0.0364, + "num_tokens": 272500898.0, + "reward": 0.6240234375, + "reward_std": 0.13753917813301086, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 5592.0, + "completions/max_terminated_length": 5592.0, + "completions/mean_length": 1123.05078125, + "completions/mean_terminated_length": 1131.8936767578125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.9185293758397459, + "grad_norm": 0.18275490783422715, + "learning_rate": 1e-06, + "loss": -0.0118, + "num_tokens": 273140108.0, + "reward": 0.5380859375, + "reward_std": 0.09426388144493103, + "rewards/accuracy_reward/mean": 0.04296875, + "rewards/accuracy_reward/std": 0.2029850035905838, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 7679.0, + "completions/max_terminated_length": 7679.0, + "completions/mean_length": 1269.73828125, + "completions/mean_terminated_length": 1279.7362060546875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.9204836936606816, + "grad_norm": 0.22078342247716057, + "learning_rate": 1e-06, + "loss": -0.0168, + "num_tokens": 273853846.0, + "reward": 0.61328125, + "reward_std": 0.1475171446800232, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 4963.0, + "completions/max_terminated_length": 4963.0, + "completions/mean_length": 1199.77734375, + "completions/mean_terminated_length": 1202.125244140625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 521.0, + "epoch": 0.9224380114816172, + "grad_norm": 0.15704116212866737, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 274559652.0, + "reward": 0.5693359375, + "reward_std": 0.09550131857395172, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 5838.0, + "completions/max_terminated_length": 5838.0, + "completions/mean_length": 1262.451171875, + "completions/mean_terminated_length": 1274.9013671875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.9243923293025528, + "grad_norm": 0.17980480412050295, + "learning_rate": 1e-06, + "loss": -0.0189, + "num_tokens": 275284779.0, + "reward": 0.5361328125, + "reward_std": 0.09638580679893494, + "rewards/accuracy_reward/mean": 0.041015625, + "rewards/accuracy_reward/std": 0.19852031767368317, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 7808.0, + "completions/max_terminated_length": 7808.0, + "completions/mean_length": 1149.17578125, + "completions/mean_terminated_length": 1158.224365234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.9263466471234885, + "grad_norm": 0.1992349629521845, + "learning_rate": 1e-06, + "loss": -0.0221, + "num_tokens": 275953669.0, + "reward": 0.6220703125, + "reward_std": 0.1476491093635559, + "rewards/accuracy_reward/mean": 0.126953125, + "rewards/accuracy_reward/std": 0.33324605226516724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5588.0, + "completions/max_terminated_length": 5588.0, + "completions/mean_length": 1126.76953125, + "completions/mean_terminated_length": 1131.1883544921875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.928300964944424, + "grad_norm": 0.23204603421178877, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 276613247.0, + "reward": 0.650390625, + "reward_std": 0.16812561452388763, + "rewards/accuracy_reward/mean": 0.15234375, + "rewards/accuracy_reward/std": 0.35970520973205566, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 5957.0, + "completions/max_terminated_length": 5957.0, + "completions/mean_length": 1227.337890625, + "completions/mean_terminated_length": 1232.1510009765625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.9302552827653597, + "grad_norm": 0.27294287151566715, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 277318044.0, + "reward": 0.6416015625, + "reward_std": 0.24193865060806274, + "rewards/accuracy_reward/mean": 0.14453125, + "rewards/accuracy_reward/std": 0.35197147727012634, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 6209.0, + "completions/max_terminated_length": 6209.0, + "completions/mean_length": 1375.623046875, + "completions/mean_terminated_length": 1381.0177001953125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 446.0, + "epoch": 0.9322096005862953, + "grad_norm": 0.21772984077860952, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 278105531.0, + "reward": 0.6171875, + "reward_std": 0.14019782841205597, + "rewards/accuracy_reward/mean": 0.119140625, + "rewards/accuracy_reward/std": 0.32427072525024414, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 7717.0, + "completions/max_terminated_length": 7717.0, + "completions/mean_length": 1366.416015625, + "completions/mean_terminated_length": 1382.61865234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.934163918407231, + "grad_norm": 0.19457604333092368, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 278893200.0, + "reward": 0.583984375, + "reward_std": 0.15463699400424957, + "rewards/accuracy_reward/mean": 0.08984375, + "rewards/accuracy_reward/std": 0.2862374484539032, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.98828125, + "rewards/soft_format_reward/std": 0.10772226005792618, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3162.0, + "completions/max_terminated_length": 3162.0, + "completions/mean_length": 1184.904296875, + "completions/mean_terminated_length": 1187.2230224609375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.9361182362281666, + "grad_norm": 0.2489410884634794, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 279585871.0, + "reward": 0.6630859375, + "reward_std": 0.22684462368488312, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 6448.0, + "completions/max_terminated_length": 6448.0, + "completions/mean_length": 1136.7734375, + "completions/mean_terminated_length": 1145.724365234375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 473.0, + "epoch": 0.9380725540491023, + "grad_norm": 0.22519170418384732, + "learning_rate": 1e-06, + "loss": -0.029, + "num_tokens": 280257579.0, + "reward": 0.669921875, + "reward_std": 0.19624492526054382, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4951.0, + "completions/max_terminated_length": 4951.0, + "completions/mean_length": 1258.939453125, + "completions/mean_terminated_length": 1268.8524169921875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.9400268718700379, + "grad_norm": 0.20889498029833217, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 280979612.0, + "reward": 0.62109375, + "reward_std": 0.16103525459766388, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3310423493385315, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 6499.0, + "completions/max_terminated_length": 6499.0, + "completions/mean_length": 1198.8203125, + "completions/mean_terminated_length": 1205.8861083984375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.9419811896909734, + "grad_norm": 0.29376007446162916, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 281671840.0, + "reward": 0.6962890625, + "reward_std": 0.18663723766803741, + "rewards/accuracy_reward/mean": 0.19921875, + "rewards/accuracy_reward/std": 0.39980348944664, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 7867.0, + "completions/max_terminated_length": 7867.0, + "completions/mean_length": 1313.0390625, + "completions/mean_terminated_length": 1320.778076171875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.9439355075119091, + "grad_norm": 0.22821613240829183, + "learning_rate": 1e-06, + "loss": -0.0142, + "num_tokens": 282421924.0, + "reward": 0.630859375, + "reward_std": 0.20425201952457428, + "rewards/accuracy_reward/mean": 0.134765625, + "rewards/accuracy_reward/std": 0.3418070077896118, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 7278.0, + "completions/max_terminated_length": 7278.0, + "completions/mean_length": 1174.916015625, + "completions/mean_terminated_length": 1177.2152099609375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.9458898253328447, + "grad_norm": 0.27341080509739546, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 283104665.0, + "reward": 0.66796875, + "reward_std": 0.17398236691951752, + "rewards/accuracy_reward/mean": 0.169921875, + "rewards/accuracy_reward/std": 0.3759314715862274, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2519.0, + "completions/max_terminated_length": 2519.0, + "completions/mean_length": 1088.140625, + "completions/mean_terminated_length": 1088.140625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.9478441431537804, + "grad_norm": 0.23061259914068383, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 283740657.0, + "reward": 0.6162109375, + "reward_std": 0.12001784890890121, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.32195815443992615, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 6557.0, + "completions/max_terminated_length": 6557.0, + "completions/mean_length": 1118.794921875, + "completions/mean_terminated_length": 1123.182373046875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.949798460974716, + "grad_norm": 0.264425075792234, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 284395560.0, + "reward": 0.62890625, + "reward_std": 0.2028954178094864, + "rewards/accuracy_reward/mean": 0.130859375, + "rewards/accuracy_reward/std": 0.33757632970809937, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3279.0, + "completions/max_terminated_length": 3279.0, + "completions/mean_length": 1179.318359375, + "completions/mean_terminated_length": 1181.626220703125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.9517527787956517, + "grad_norm": 0.31717929910714326, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 285080043.0, + "reward": 0.7119140625, + "reward_std": 0.22875383496284485, + "rewards/accuracy_reward/mean": 0.212890625, + "rewards/accuracy_reward/std": 0.409751296043396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 7884.0, + "completions/max_terminated_length": 7884.0, + "completions/mean_length": 1169.51171875, + "completions/mean_terminated_length": 1174.09814453125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.9537070966165873, + "grad_norm": 0.5537675676011398, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 285768545.0, + "reward": 0.701171875, + "reward_std": 0.1790769249200821, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.4027182459831238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 6214.0, + "completions/max_terminated_length": 6214.0, + "completions/mean_length": 1253.82421875, + "completions/mean_terminated_length": 1268.6917724609375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.955661414437523, + "grad_norm": 0.21672168845733863, + "learning_rate": 1e-06, + "loss": -0.0396, + "num_tokens": 286487319.0, + "reward": 0.5673828125, + "reward_std": 0.15228576958179474, + "rewards/accuracy_reward/mean": 0.07421875, + "rewards/accuracy_reward/std": 0.2623828947544098, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 3512.0, + "completions/max_terminated_length": 3512.0, + "completions/mean_length": 1265.763671875, + "completions/mean_terminated_length": 1268.24072265625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.9576157322584585, + "grad_norm": 0.21825817777844952, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 287228622.0, + "reward": 0.5712890625, + "reward_std": 0.12500068545341492, + "rewards/accuracy_reward/mean": 0.072265625, + "rewards/accuracy_reward/std": 0.2591804563999176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 6764.0, + "completions/max_terminated_length": 6764.0, + "completions/mean_length": 1100.55078125, + "completions/mean_terminated_length": 1102.7044677734375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.9595700500793941, + "grad_norm": 0.26981424906714707, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 287881064.0, + "reward": 0.6787109375, + "reward_std": 0.17242230474948883, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 4314.0, + "completions/max_terminated_length": 4314.0, + "completions/mean_length": 974.759765625, + "completions/mean_terminated_length": 988.2713012695312, + "completions/min_length": 0.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.9615243679003298, + "grad_norm": 0.25418093972205297, + "learning_rate": 1e-06, + "loss": -0.0383, + "num_tokens": 288453325.0, + "reward": 0.5927734375, + "reward_std": 0.10644464939832687, + "rewards/accuracy_reward/mean": 0.099609375, + "rewards/accuracy_reward/std": 0.29977133870124817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.986328125, + "rewards/soft_format_reward/std": 0.1162383034825325, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 7165.0, + "completions/max_terminated_length": 7165.0, + "completions/mean_length": 1085.6875, + "completions/mean_terminated_length": 1092.08642578125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.9634786857212654, + "grad_norm": 0.22961059379008847, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 289088413.0, + "reward": 0.5830078125, + "reward_std": 0.0966651439666748, + "rewards/accuracy_reward/mean": 0.087890625, + "rewards/accuracy_reward/std": 0.2834126651287079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6114.0, + "completions/max_terminated_length": 6114.0, + "completions/mean_length": 1095.9296875, + "completions/mean_terminated_length": 1095.9296875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.9654330035422011, + "grad_norm": 0.3959456334980767, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 289743017.0, + "reward": 0.6640625, + "reward_std": 0.2380717247724533, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 2783.0, + "completions/max_terminated_length": 2783.0, + "completions/mean_length": 1019.509765625, + "completions/mean_terminated_length": 1021.5048828125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.9673873213631367, + "grad_norm": 0.3549015296751003, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 290343774.0, + "reward": 0.7060546875, + "reward_std": 0.1921609342098236, + "rewards/accuracy_reward/mean": 0.20703125, + "rewards/accuracy_reward/std": 0.40557438135147095, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5443.0, + "completions/max_terminated_length": 5443.0, + "completions/mean_length": 1049.34765625, + "completions/mean_terminated_length": 1049.34765625, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.9693416391840723, + "grad_norm": 0.39516055614547824, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 290954032.0, + "reward": 0.650390625, + "reward_std": 0.18701601028442383, + "rewards/accuracy_reward/mean": 0.150390625, + "rewards/accuracy_reward/std": 0.35780346393585205, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2623.0, + "completions/max_terminated_length": 2623.0, + "completions/mean_length": 1058.138671875, + "completions/mean_terminated_length": 1062.288330078125, + "completions/min_length": 0.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.9712959570050079, + "grad_norm": 0.28345709848572537, + "learning_rate": 1e-06, + "loss": -0.0176, + "num_tokens": 291574567.0, + "reward": 0.576171875, + "reward_std": 0.1418503224849701, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.26863065361976624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4092.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 1051.822265625, + "completions/mean_terminated_length": 1055.9471435546875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.9732502748259436, + "grad_norm": 0.37754859374958094, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 292183612.0, + "reward": 0.630859375, + "reward_std": 0.20899051427841187, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.33970388770103455, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5324.0, + "completions/max_terminated_length": 5324.0, + "completions/mean_length": 1034.296875, + "completions/mean_terminated_length": 1034.296875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.9752045926468792, + "grad_norm": 0.41257797903651733, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 292781140.0, + "reward": 0.68359375, + "reward_std": 0.23354387283325195, + "rewards/accuracy_reward/mean": 0.18359375, + "rewards/accuracy_reward/std": 0.3875311613082886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3891.0, + "completions/max_terminated_length": 3891.0, + "completions/mean_length": 1073.869140625, + "completions/mean_terminated_length": 1078.0804443359375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.9771589104678148, + "grad_norm": 0.3929758993563004, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 293398593.0, + "reward": 0.662109375, + "reward_std": 0.20519289374351501, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.37069445848464966, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 8123.0, + "completions/max_terminated_length": 8123.0, + "completions/mean_length": 1050.818359375, + "completions/mean_terminated_length": 1054.9393310546875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.9791132282887505, + "grad_norm": 0.3829472452735823, + "learning_rate": 1e-06, + "loss": -0.0168, + "num_tokens": 294012708.0, + "reward": 0.626953125, + "reward_std": 0.1886095255613327, + "rewards/accuracy_reward/mean": 0.12890625, + "rewards/accuracy_reward/std": 0.33542385697364807, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.99609375, + "rewards/soft_format_reward/std": 0.06243881583213806, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5735.0, + "completions/max_terminated_length": 5735.0, + "completions/mean_length": 1181.84375, + "completions/mean_terminated_length": 1184.156494140625, + "completions/min_length": 0.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.981067546109686, + "grad_norm": 0.3822957738619265, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 294693236.0, + "reward": 0.6103515625, + "reward_std": 0.1959744244813919, + "rewards/accuracy_reward/mean": 0.111328125, + "rewards/accuracy_reward/std": 0.31484565138816833, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 5692.0, + "completions/max_terminated_length": 5692.0, + "completions/mean_length": 1000.248046875, + "completions/mean_terminated_length": 1002.2054443359375, + "completions/min_length": 0.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.9830218639306217, + "grad_norm": 0.3830384798519429, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 295283891.0, + "reward": 0.6767578125, + "reward_std": 0.2175215184688568, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.38430243730545044, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.994140625, + "rewards/soft_format_reward/std": 0.07639661431312561, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 7631.0, + "completions/max_terminated_length": 7631.0, + "completions/mean_length": 969.873046875, + "completions/mean_terminated_length": 977.5098266601562, + "completions/min_length": 0.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.9849761817515573, + "grad_norm": 0.42393804037907257, + "learning_rate": 1e-06, + "loss": -0.0144, + "num_tokens": 295844178.0, + "reward": 0.6728515625, + "reward_std": 0.24096624553203583, + "rewards/accuracy_reward/mean": 0.177734375, + "rewards/accuracy_reward/std": 0.3826628625392914, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.990234375, + "rewards/soft_format_reward/std": 0.09843364357948303, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 7657.0, + "completions/max_terminated_length": 7657.0, + "completions/mean_length": 1027.501953125, + "completions/mean_terminated_length": 1035.592529296875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.986930499572493, + "grad_norm": 0.4524177106995229, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 296438803.0, + "reward": 0.75, + "reward_std": 0.2938448190689087, + "rewards/accuracy_reward/mean": 0.25390625, + "rewards/accuracy_reward/std": 0.43567025661468506, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.9921875, + "rewards/soft_format_reward/std": 0.08812850713729858, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3266.0, + "completions/max_terminated_length": 3266.0, + "completions/mean_length": 1056.494140625, + "completions/mean_terminated_length": 1056.494140625, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.9888848173934286, + "grad_norm": 0.3619438551252798, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 297054192.0, + "reward": 0.7265625, + "reward_std": 0.2644432485103607, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4190165400505066, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3480.0, + "completions/max_terminated_length": 3480.0, + "completions/mean_length": 1094.634765625, + "completions/mean_terminated_length": 1094.634765625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.9908391352143643, + "grad_norm": 0.37532222818593464, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 297679637.0, + "reward": 0.65625, + "reward_std": 0.22193288803100586, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36344730854034424, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3836.0, + "completions/max_terminated_length": 3836.0, + "completions/mean_length": 1010.59375, + "completions/mean_terminated_length": 1010.59375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.9927934530352999, + "grad_norm": 0.375592765657, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 298263669.0, + "reward": 0.673828125, + "reward_std": 0.2310568392276764, + "rewards/accuracy_reward/mean": 0.173828125, + "rewards/accuracy_reward/std": 0.3793322443962097, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4340.0, + "completions/max_terminated_length": 4340.0, + "completions/mean_length": 912.21484375, + "completions/mean_terminated_length": 912.21484375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.9947477708562354, + "grad_norm": 0.44595836157819063, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 298794275.0, + "reward": 0.791015625, + "reward_std": 0.28581932187080383, + "rewards/accuracy_reward/mean": 0.291015625, + "rewards/accuracy_reward/std": 0.45467492938041687, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 1.0, + "rewards/soft_format_reward/std": 0.0, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 2273.0, + "completions/max_terminated_length": 2273.0, + "completions/mean_length": 947.17578125, + "completions/mean_terminated_length": 949.0293579101562, + "completions/min_length": 0.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.9967020886771711, + "grad_norm": 0.4558458171092683, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 299343965.0, + "reward": 0.6611328125, + "reward_std": 0.23155611753463745, + "rewards/accuracy_reward/mean": 0.162109375, + "rewards/accuracy_reward/std": 0.3689115643501282, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0019920318725099584, + "completions/max_length": 2272.0, + "completions/max_terminated_length": 2272.0, + "completions/mean_length": 814.7310791015625, + "completions/mean_terminated_length": 816.3572998046875, + "completions/min_length": 0.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.9986564064981067, + "grad_norm": 0.49546221374173544, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 299824814.0, + "reward": 0.7470703125, + "reward_std": 0.26824861764907837, + "rewards/accuracy_reward/mean": 0.248046875, + "rewards/accuracy_reward/std": 0.4323015511035919, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/soft_format_reward/mean": 0.998046875, + "rewards/soft_format_reward/std": 0.04419417306780815, + "step": 511 + }, + { + "epoch": 0.9986564064981067, + "step": 511, + "total_flos": 0.0, + "train_loss": 0.006096165258858404, + "train_runtime": 48643.0759, + "train_samples_per_second": 0.337, + "train_steps_per_second": 0.011 + } + ], + "logging_steps": 1, + "max_steps": 511, + "num_input_tokens_seen": 299824814, + "num_train_epochs": 1, + "save_steps": 52, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}